publish master branch snapshot, revision cdcab9d7ab48ffb0ee5629fabbfa06cb45debd9b
author Alexey Suhov <alexey.suhov@intel.com>
Wed, 15 Apr 2020 16:01:57 +0000 (19:01 +0300)
committer Alexey Suhov <alexey.suhov@intel.com>
Wed, 15 Apr 2020 16:01:57 +0000 (19:01 +0300)
971 files changed:
.gitignore
.gitmodules
CMakeLists.txt
Jenkinsfile [new file with mode: 0644]
cmake/developer_package.cmake
inference-engine/cmake/add_ie_target.cmake
inference-engine/cmake/clang_format.cmake
inference-engine/cmake/dependencies.cmake
inference-engine/cmake/developer_package_config.cmake.in
inference-engine/cmake/features_ie.cmake
inference-engine/cmake/models.cmake [new file with mode: 0644]
inference-engine/cmake/plugins/plugins.cmake
inference-engine/cmake/vpu_dependencies.cmake
inference-engine/ie_bridges/c/samples/hello_nv12_input_classification/main.c
inference-engine/ie_bridges/c/samples/object_detection_sample_ssd/main.c
inference-engine/ie_bridges/c/src/CMakeLists.txt
inference-engine/ie_bridges/python/sample/object_detection_sample_ssd/object_detection_sample_ssd.py
inference-engine/ie_bridges/python/src/openvino/inference_engine/CMakeLists.txt
inference-engine/ie_bridges/python/src/openvino/inference_engine/ie_api.pyx
inference-engine/include/cpp/ie_cnn_net_reader.h
inference-engine/include/cpp/ie_cnn_network.h
inference-engine/include/details/ie_so_pointer.hpp
inference-engine/include/ie_icnn_net_reader.h
inference-engine/include/ie_icnn_network.hpp
inference-engine/include/ie_layers.h
inference-engine/include/ie_parameter.hpp
inference-engine/include/ie_plugin_config.hpp
inference-engine/include/ie_precision.hpp
inference-engine/samples/benchmark_app/README.md
inference-engine/samples/benchmark_app/benchmark_app.hpp
inference-engine/samples/benchmark_app/main.cpp
inference-engine/samples/benchmark_app/utils.cpp
inference-engine/samples/benchmark_app/utils.hpp
inference-engine/samples/hello_query_device/main.cpp
inference-engine/samples/speech_sample/main.cpp
inference-engine/scripts/run_code_checks.sh [new file with mode: 0644]
inference-engine/src/CMakeLists.txt
inference-engine/src/cldnn_engine/cldnn_config.cpp
inference-engine/src/cldnn_engine/cldnn_program.cpp
inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp
inference-engine/src/gna_plugin/gna_executable_network.hpp
inference-engine/src/gna_plugin/gna_graph_compiler.cpp
inference-engine/src/gna_plugin/gna_graph_compiler.hpp
inference-engine/src/gna_plugin/gna_model_serial.cpp
inference-engine/src/gna_plugin/gna_plugin.cpp
inference-engine/src/gna_plugin/gna_plugin.hpp
inference-engine/src/gna_plugin/gna_plugin_config.cpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_config.hpp [new file with mode: 0644]
inference-engine/src/gna_plugin/gna_plugin_internal.hpp
inference-engine/src/gna_plugin/gna_plugin_policy.hpp
inference-engine/src/gna_plugin/gna_plugin_query_api.cpp
inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
inference-engine/src/hetero_plugin/hetero_async_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_infer_request.cpp
inference-engine/src/hetero_plugin/hetero_infer_request.hpp
inference-engine/src/inference_engine/CMakeLists.txt
inference-engine/src/inference_engine/blob_factory.cpp
inference-engine/src/inference_engine/cnn_network_ngraph_impl.cpp
inference-engine/src/inference_engine/cnn_network_ngraph_impl.hpp [moved from inference-engine/src/plugin_api/cnn_network_ngraph_impl.hpp with 73% similarity]
inference-engine/src/inference_engine/generic_ie.cpp
inference-engine/src/inference_engine/ie_core.cpp
inference-engine/src/inference_engine/ie_rtti.cpp
inference-engine/src/inference_engine/ie_system_conf.cpp
inference-engine/src/inference_engine/os/lin/lin_system_conf.cpp
inference-engine/src/inference_engine/threading/ie_cpu_streams_executor.cpp
inference-engine/src/inference_engine/threading/ie_executor_manager.cpp
inference-engine/src/inference_engine/threading/ie_thread_affinity.cpp
inference-engine/src/ir_readers/CMakeLists.txt [new file with mode: 0644]
inference-engine/src/ir_readers/ie_blob_proxy.hpp [moved from inference-engine/src/inference_engine/ie_blob_proxy.hpp with 100% similarity]
inference-engine/src/ir_readers/ie_cnn_net_reader_impl.cpp [moved from inference-engine/src/inference_engine/ie_cnn_net_reader_impl.cpp with 92% similarity]
inference-engine/src/ir_readers/ie_cnn_net_reader_impl.h [moved from inference-engine/src/inference_engine/ie_cnn_net_reader_impl.h with 89% similarity]
inference-engine/src/ir_readers/ie_format_parser.cpp [moved from inference-engine/src/inference_engine/ie_format_parser.cpp with 99% similarity]
inference-engine/src/ir_readers/ie_format_parser.h [moved from inference-engine/src/inference_engine/ie_format_parser.h with 97% similarity]
inference-engine/src/ir_readers/ie_ir_parser.cpp [moved from inference-engine/src/inference_engine/ie_ir_parser.cpp with 99% similarity]
inference-engine/src/ir_readers/ie_ir_parser.hpp [moved from inference-engine/src/inference_engine/ie_ir_parser.hpp with 99% similarity]
inference-engine/src/ir_readers/ie_ir_reader.cpp [moved from inference-engine/src/inference_engine/ie_ir_reader.cpp with 100% similarity]
inference-engine/src/ir_readers/ie_ir_reader.hpp [moved from inference-engine/src/inference_engine/ie_ir_reader.hpp with 97% similarity]
inference-engine/src/ir_readers/ie_layer_parsers.cpp [moved from inference-engine/src/inference_engine/ie_layer_parsers.cpp with 100% similarity]
inference-engine/src/ir_readers/ie_layer_parsers.h [moved from inference-engine/src/inference_engine/ie_layer_parsers.h with 100% similarity]
inference-engine/src/ir_readers/parsers.h [moved from inference-engine/src/inference_engine/parsers.h with 100% similarity]
inference-engine/src/legacy_api/CMakeLists.txt
inference-engine/src/legacy_api/include/cnn_network_impl.hpp
inference-engine/src/legacy_api/include/convert_function_to_cnn_network.hpp [moved from inference-engine/src/plugin_api/convert_function_to_cnn_network.hpp with 68% similarity]
inference-engine/src/legacy_api/include/graph_transformer.h
inference-engine/src/legacy_api/include/ie_ngraph_utils.hpp [moved from inference-engine/src/inference_engine/ie_ngraph_utils.hpp with 94% similarity]
inference-engine/src/legacy_api/include/ie_util_internal.hpp
inference-engine/src/legacy_api/include/layer_transform.hpp
inference-engine/src/legacy_api/src/cnn_network_int8_normalizer.cpp
inference-engine/src/legacy_api/src/convert_function_to_cnn_network.cpp [moved from inference-engine/src/inference_engine/convert_function_to_cnn_network.cpp with 59% similarity]
inference-engine/src/legacy_api/src/graph_transformer.cpp
inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.cpp [moved from inference-engine/src/inference_engine/ie_cnn_layer_builder_ngraph.cpp with 90% similarity]
inference-engine/src/legacy_api/src/ie_cnn_layer_builder_ngraph.h [moved from inference-engine/src/inference_engine/ie_cnn_layer_builder_ngraph.h with 96% similarity]
inference-engine/src/legacy_api/src/ie_layer_validators.cpp
inference-engine/src/legacy_api/src/ie_layer_validators.hpp
inference-engine/src/legacy_api/src/ie_layers.cpp
inference-engine/src/legacy_api/src/ie_util_internal.cpp
inference-engine/src/legacy_api/src/layer_transform.cpp
inference-engine/src/legacy_api/src/network_serializer.cpp
inference-engine/src/legacy_api/src/shape_infer/built-in/ie_built_in_holder.cpp
inference-engine/src/legacy_api/src/shape_infer/built-in/ie_scatter_shape_infer.hpp
inference-engine/src/low_precision_transformations/CMakeLists.txt
inference-engine/src/mkldnn_plugin/bf16transformer.cpp [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/bf16transformer.h [new file with mode: 0644]
inference-engine/src/mkldnn_plugin/config.cpp
inference-engine/src/mkldnn_plugin/config.h
inference-engine/src/mkldnn_plugin/mkldnn_exec_network.cpp
inference-engine/src/mkldnn_plugin/mkldnn_extension_utils.cpp
inference-engine/src/mkldnn_plugin/mkldnn_graph_optimizer.cpp
inference-engine/src/mkldnn_plugin/mkldnn_memory.cpp
inference-engine/src/mkldnn_plugin/mkldnn_node.cpp
inference-engine/src/mkldnn_plugin/mkldnn_plugin.cpp
inference-engine/src/mkldnn_plugin/nodes/base.hpp
inference-engine/src/mkldnn_plugin/nodes/interp.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_conv_node.h
inference-engine/src/mkldnn_plugin/nodes/mkldnn_eltwise_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_fullyconnected_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_input_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_lrn_node.cpp
inference-engine/src/mkldnn_plugin/nodes/mkldnn_pooling_node.cpp
inference-engine/src/mkldnn_plugin/nodes/normalize.cpp
inference-engine/src/mkldnn_plugin/nodes/topk.cpp
inference-engine/src/mkldnn_plugin/utils/blob_dump.cpp
inference-engine/src/plugin_api/blob_factory.hpp
inference-engine/src/plugin_api/cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp
inference-engine/src/plugin_api/ie_system_conf.h
inference-engine/src/plugin_api/threading/ie_cpu_streams_executor.hpp
inference-engine/src/plugin_api/threading/ie_executor_manager.hpp
inference-engine/src/plugin_api/threading/ie_istreams_executor.hpp
inference-engine/src/plugin_api/threading/ie_thread_affinity.hpp
inference-engine/src/plugin_api/threading/ie_thread_local.hpp
inference-engine/src/preprocessing/CMakeLists.txt
inference-engine/src/preprocessing/cpu_x86_sse42/ie_preprocess_gapi_kernels_sse42.cpp
inference-engine/src/preprocessing/ie_preprocess_gapi_kernels.cpp
inference-engine/src/preprocessing/ie_preprocess_gapi_kernels_simd_impl.hpp
inference-engine/src/transformations/CMakeLists.txt
inference-engine/src/transformations/src/transformations/pull_transpose_through_fq.cpp
inference-engine/src/transformations/src/transformations/utils/utils.cpp
inference-engine/src/vpu/common/CMakeLists.txt
inference-engine/src/vpu/common/include/vpu/ngraph/operations/dynamic_shape_resolver.hpp [new file with mode: 0644]
inference-engine/src/vpu/common/include/vpu/ngraph/operations/static_shape_nonzero.hpp [new file with mode: 0644]
inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape.hpp [new file with mode: 0644]
inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp [new file with mode: 0644]
inference-engine/src/vpu/common/include/vpu/utils/error.hpp
inference-engine/src/vpu/common/include/vpu/utils/ie_helpers.hpp
inference-engine/src/vpu/common/src/ngraph/operations/dynamic_shape_resolver.cpp [new file with mode: 0644]
inference-engine/src/vpu/common/src/ngraph/operations/static_shape_nonzero.cpp [new file with mode: 0644]
inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape.cpp [new file with mode: 0644]
inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp [new file with mode: 0644]
inference-engine/src/vpu/common/src/utils/ie_helpers.cpp
inference-engine/src/vpu/custom_kernels/grn.cl
inference-engine/src/vpu/custom_kernels/mvn.cl
inference-engine/src/vpu/graph_transformer/include/vpu/backend/backend.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/frontend/frontend.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/graph_transformer.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/middleend/allocator/allocator.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/middleend/hw/tiling.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/middleend/hw/utility.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/middleend/sw/utility.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/model/data.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/batch_norm_contents.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/calculated_data_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/conv_weights_contents.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/data_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/deconvolution_contents.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/default_sw_weights_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_const_data_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_weights_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/ie_blob_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/kernel_binary_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mean_contents.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/merge_fc_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mtcnn_blob_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/prelu_blob_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/priorbox_contents.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/replicated_data_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/scaled_content.hpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/include/vpu/model/data_desc.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/model/stage.hpp
inference-engine/src/vpu/graph_transformer/include/vpu/stage_builder.hpp
inference-engine/src/vpu/graph_transformer/src/backend/dump_to_dot.cpp
inference-engine/src/vpu/graph_transformer/src/backend/serialize.cpp
inference-engine/src/vpu/graph_transformer/src/blob_reader.cpp
inference-engine/src/vpu/graph_transformer/src/frontend/detect_network_batch.cpp
inference-engine/src/vpu/graph_transformer/src/frontend/frontend.cpp
inference-engine/src/vpu/graph_transformer/src/frontend/parse_data.cpp
inference-engine/src/vpu/graph_transformer/src/frontend/pre_process.cpp
inference-engine/src/vpu/graph_transformer/src/frontend/remove_const_layers.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/allocator/allocator.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/hw/conv_tiling/hw_stage_tiler.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/hw/utility.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/adjust_data_batch.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/adjust_data_location.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/allocate_resources.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/eliminate_const_concat.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/final_check.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/hw_extra_split.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/hw_fc_tiling.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/merge_parallel_fc.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/replace_deconv_by_conv.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/replace_gemm_by_conv.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/replace_priorbox_with_const.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/replace_with_reduce_mean.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_conv3d_into_2d.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_grouped_conv.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_hw_conv_and_pool.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_hw_depth_convolution.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/split_pool3d_into_2d.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/sw_conv_adaptation.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/sw_deconv_adaptation.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/sw_fc_adaptation.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/passes/weights_analysis.cpp
inference-engine/src/vpu/graph_transformer/src/middleend/sw/utility.cpp
inference-engine/src/vpu/graph_transformer/src/model/data.cpp
inference-engine/src/vpu/graph_transformer/src/model/data_contents/batch_norm_contents.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/calculated_data_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/conv_weights_contents.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/data_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/deconvolution_contents.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/default_sw_weights_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_const_data_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_weights_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/ie_blob_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/kernel_binary_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/mean_contents.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/merge_fc_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/mtcnn_blob_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/prelu_blob_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/priorbox_contents.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/replicated_data_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/data_contents/scaled_content.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/model/model.cpp
inference-engine/src/vpu/graph_transformer/src/stages/batch_norm.cpp
inference-engine/src/vpu/graph_transformer/src/stages/custom.cpp
inference-engine/src/vpu/graph_transformer/src/stages/eltwise.cpp
inference-engine/src/vpu/graph_transformer/src/stages/exp_generateproposals.cpp
inference-engine/src/vpu/graph_transformer/src/stages/exp_priorgridgenerator.cpp
inference-engine/src/vpu/graph_transformer/src/stages/exp_topkrois.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/gather.cpp
inference-engine/src/vpu/graph_transformer/src/stages/mtcnn.cpp
inference-engine/src/vpu/graph_transformer/src/stages/mx_stage.cpp
inference-engine/src/vpu/graph_transformer/src/stages/nonzero.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/normalize.cpp
inference-engine/src/vpu/graph_transformer/src/stages/prelu.cpp
inference-engine/src/vpu/graph_transformer/src/stages/reduce.cpp
inference-engine/src/vpu/graph_transformer/src/stages/rnn.cpp
inference-engine/src/vpu/graph_transformer/src/stages/roi_align.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/scatter_update.cpp [new file with mode: 0644]
inference-engine/src/vpu/graph_transformer/src/stages/screlu.cpp
inference-engine/src/vpu/graph_transformer/src/stages/tensor_iterator.cpp
inference-engine/src/vpu/myriad_plugin/CMakeLists.txt
inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp
inference-engine/src/vpu/myriad_plugin/myriad_plugin.cpp
inference-engine/tests/functional/inference_engine/extension_lib/include/extension.hpp
inference-engine/tests/functional/inference_engine/extension_lib/src/extension.cpp
inference-engine/tests/functional/inference_engine/ie_extension_test.cpp
inference-engine/tests/functional/inference_engine/ie_irelease_test.cpp
inference-engine/tests/functional/inference_engine/net_reader_test.cpp
inference-engine/tests/functional/inference_engine/network_serializer_test.cpp
inference-engine/tests/functional/inference_engine/ngraph_reader/ngraph_reader_tests.hpp
inference-engine/tests/functional/inference_engine/ngraph_reshape_tests.cpp
inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/conv_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/conv_dwconv_relu.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/faster_100_5_1_1_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_relu_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_scaleshift.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_elu_conv.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed1_eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed2_eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x3_eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x2_conv_x2_eltwise.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/tail_fp32_optimization.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/bfloat16/topk_inputs_i32.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/configuration_tests/configuration_tests.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/execution_graph_tests/unique_node_names.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/ngraph_conversion_tests/plugin_specific_ngraph_conversion.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/activation.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/batch_to_space.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/concat.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/convolution.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/pooling.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reshape.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/softmax.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/space_to_batch.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/split.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp
inference-engine/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/split_conv_concat.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/activation.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/concat.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/pooling.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/split.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/skip_tests_config.cpp
inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/split_conv_concat.cpp
inference-engine/tests/functional/plugin/gpu/CMakeLists.txt
inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/ngraph_conversion_tests/plugin_specific_ngraph_conversion.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/activation.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/concat.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/convolution.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reshape.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/split.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/strided_slice.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/skip_tests_config.cpp
inference-engine/tests/functional/plugin/gpu/shared_tests_instances/subgraph_tests/split_conv_concat.cpp
inference-engine/tests/functional/plugin/myriad/CMakeLists.txt
inference-engine/tests/functional/plugin/myriad/ngraph/conversions/dynamic_shape_resolver.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/ngraph/operations/dynamic_shape_resolver.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/ngraph/operations/static_shape_nonzero.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/ngraph/utils/ngraph_utils.h [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/activation.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/concat.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/convolution.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/nonzero.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/pooling.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/split.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp
inference-engine/tests/functional/plugin/myriad/shared_tests_instances/subgraph_tests/split_conv_concat.cpp
inference-engine/tests/functional/plugin/shared/include/configuration_tests/configuration_tests.hpp
inference-engine/tests/functional/plugin/shared/include/execution_graph_tests/unique_node_names.hpp
inference-engine/tests/functional/plugin/shared/include/ngraph_conversion_tests/plugin_specific_ngraph_conversion.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/activation.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/batch_to_space.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/concat.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/convolution.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/nonzero.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/pooling.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reshape.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/softmax.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/space_to_batch.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/split.hpp
inference-engine/tests/functional/plugin/shared/include/single_layer_tests/strided_slice.hpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/include/subgraph_tests/split_conv_concat.hpp
inference-engine/tests/functional/plugin/shared/src/configuration_tests/configuration_tests.cpp
inference-engine/tests/functional/plugin/shared/src/execution_graph_tests/unique_node_names.cpp
inference-engine/tests/functional/plugin/shared/src/ngraph_conversion_tests/plugin_specific_ngraph_conversion.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/activation.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/batch_to_space.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/concat.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/convolution.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/nonzero.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/pooling.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reshape.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/softmax.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/space_to_batch.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/split.cpp
inference-engine/tests/functional/plugin/shared/src/single_layer_tests/strided_slice.cpp [new file with mode: 0644]
inference-engine/tests/functional/plugin/shared/src/subgraph_tests/split_conv_concat.cpp
inference-engine/tests/ie_test_utils/common_test_utils/common_layers_params.cpp
inference-engine/tests/ie_test_utils/common_test_utils/common_layers_params.hpp
inference-engine/tests/ie_test_utils/common_test_utils/common_utils.hpp
inference-engine/tests/ie_test_utils/common_test_utils/data_utils.hpp
inference-engine/tests/ie_test_utils/common_test_utils/file_utils.hpp
inference-engine/tests/ie_test_utils/common_test_utils/test_common.cpp
inference-engine/tests/ie_test_utils/common_test_utils/test_common.hpp
inference-engine/tests/ie_test_utils/common_test_utils/test_constants.hpp
inference-engine/tests/ie_test_utils/common_test_utils/unicode_utils.hpp
inference-engine/tests/ie_test_utils/common_test_utils/xml_net_builder/ir_net.cpp
inference-engine/tests/ie_test_utils/common_test_utils/xml_net_builder/xml_father.hpp
inference-engine/tests/ie_test_utils/common_test_utils/xml_net_builder/xml_filler.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/CMakeLists.txt
inference-engine/tests/ie_test_utils/functional_test_utils/blob_utils.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp [new file with mode: 0644]
inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/network_utils.cpp
inference-engine/tests/ie_test_utils/functional_test_utils/network_utils.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/precision_utils.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/skip_tests_config.hpp
inference-engine/tests/ie_test_utils/functional_test_utils/test_model/test_model.cpp
inference-engine/tests/ie_test_utils/functional_test_utils/test_model/test_model.hpp
inference-engine/tests/ie_test_utils/unit_test_utils/CMakeLists.txt
inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_ie_imemory_state.hpp
inference-engine/tests/ie_test_utils/unit_test_utils/mocks/mock_not_empty_icnn_network.hpp
inference-engine/tests/ngraph_functions/include/ngraph_functions/builders.hpp
inference-engine/tests/ngraph_functions/include/ngraph_functions/pass/convert_prc.hpp
inference-engine/tests/ngraph_functions/include/ngraph_functions/subgraph_builders.hpp [new file with mode: 0644]
inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/data_utils.hpp
inference-engine/tests/ngraph_functions/include/ngraph_functions/utils/ngraph_helpers.hpp
inference-engine/tests/ngraph_functions/src/activation.cpp
inference-engine/tests/ngraph_functions/src/batch_to_space.cpp
inference-engine/tests/ngraph_functions/src/constant.cpp
inference-engine/tests/ngraph_functions/src/convolution.cpp
inference-engine/tests/ngraph_functions/src/params_vector.cpp
inference-engine/tests/ngraph_functions/src/space_to_batch.cpp
inference-engine/tests/ngraph_functions/src/split.cpp
inference-engine/tests/ngraph_functions/src/strided_slice.cpp [new file with mode: 0644]
inference-engine/tests/ngraph_functions/src/utils/ngraph_helpers.cpp
inference-engine/tests/unit/CMakeLists.txt
inference-engine/tests/unit/gna/gna_api_stub.cpp
inference-engine/tests/unit/gna/gna_mock_api.hpp
inference-engine/tests/unit/gna/gna_plugin_config_test.cpp [new file with mode: 0644]
inference-engine/tests/unit/inference_engine/ie_exception_test.cpp
inference-engine/tests/unit/inference_engine/system_allocator_test.cpp
inference-engine/tests/unit/vpu/heap_test.cpp
inference-engine/tests_deprecated/helpers/CMakeLists.txt
inference-engine/tests_deprecated/helpers/single_layer_common.cpp
inference-engine/tests_deprecated/helpers/single_layer_common.hpp
inference-engine/tests_deprecated/helpers/test_model_path.cpp [moved from inference-engine/tests_deprecated/helpers/test_models_path.cpp with 90% similarity]
inference-engine/tests_deprecated/helpers/test_model_repo.hpp [new file with mode: 0644]
inference-engine/tests_deprecated/helpers/test_model_repo.hpp.in [deleted file]
inference-engine/tests_deprecated/helpers/tests_common.cpp
inference-engine/tests_deprecated/helpers/tests_common.hpp
inference-engine/tests_deprecated/unit/CMakeLists.txt
inference-engine/tests_deprecated/unit/cnn_network/v2_format_parser_test.cpp
inference-engine/tests_deprecated/unit/engines/gna/gna_matcher.cpp
inference-engine/tests_deprecated/unit/engines/gna/gna_matcher.hpp
inference-engine/tests_deprecated/unit/engines/gna/layers/gna_align_filter2_tests.cpp [new file with mode: 0644]
inference-engine/tests_deprecated/unit/engines/gna/matchers/copy_matcher.hpp
inference-engine/tests_deprecated/unit/engines/gna/matchers/weights_matcher.hpp
inference-engine/tests_deprecated/unit/engines/mkldnn/dump_test.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/extensions/math_tests.cpp
inference-engine/tests_deprecated/unit/engines/mkldnn/graph/layers/extensions/scatter_tests.cpp
inference-engine/tests_deprecated/unit/engines/vpu/adjust_data_location_tests.cpp
inference-engine/tests_deprecated/unit/engines/vpu/eliminate_const_concat_tests.cpp
inference-engine/tests_deprecated/unit/engines/vpu/merge_parallel_fc.cpp
inference-engine/tests_deprecated/unit/engines/vpu/replace_with_screlu_tests.cpp
inference-engine/tests_deprecated/unit/inference_engine_tests/cnn_ngraph_impl_tests.cpp
inference-engine/tests_deprecated/unit/inference_engine_tests/convert_ngraph_to_cnn_network_tests.cpp
inference-engine/tests_deprecated/unit/inference_engine_tests/network_serializer_tests.cpp
inference-engine/tests_deprecated/unit/inference_engine_tests/pointer_test.cpp [deleted file]
inference-engine/tests_deprecated/unit/topology_verification_tests/v2_topology_verification_test.cpp
inference-engine/thirdparty/clDNN/api/layout.hpp
inference-engine/thirdparty/clDNN/api/tensor.hpp
inference-engine/thirdparty/clDNN/api_extension/fused_conv_eltwise.hpp
inference-engine/thirdparty/clDNN/common/khronos_ocl_clhpp/cl2_ext.hpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.cpp
inference-engine/thirdparty/clDNN/kernel_selector/common/tensor_type.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/concatenation/concatenation_kernel_simple_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_imad.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_ref.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fully_connected/fully_connected_kernel_imad.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_base.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.h [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_selector.h
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_selector.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/resample/resample_kernel_ref.h
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/concatenation_gpu_simple_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3_ks.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_iyxo.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_to_fs_byx_fsv32.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad_b_fs_yx_fsv32_dw.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_block2_opt.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_iyxo.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_imad.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/data_types.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/fetch.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/include/mmad.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_accumulate.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_reduce.cl [new file with mode: 0644]
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_data.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/reorder_weights.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/resample_ref.cl
inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/common/primitive_db_gen.py
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_base.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_common.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.cpp
inference-engine/thirdparty/clDNN/kernel_selector/core/kernel_selector_params.h
inference-engine/thirdparty/clDNN/src/error_handler.cpp
inference-engine/thirdparty/clDNN/src/fused_conv_eltwise.cpp
inference-engine/thirdparty/clDNN/src/gpu/concatenation_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/convolution_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/eltwise_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/fully_connected_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/fused_conv_eltwise_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/memory_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/mvn_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/scale_gpu.cpp
inference-engine/thirdparty/clDNN/src/gpu/strided_slice_gpu.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/graph_initializations.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/pre_replace_deconv.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_buffer_fusing.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_quantization.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/remove_redundant_reorders.cpp
inference-engine/thirdparty/clDNN/src/graph_optimizer/reorder_inputs.cpp
inference-engine/thirdparty/clDNN/src/include/depth_to_space_inst.h
inference-engine/thirdparty/clDNN/src/include/pass_manager.h
inference-engine/thirdparty/clDNN/src/include/program_helpers.h
inference-engine/thirdparty/clDNN/src/include/to_string_utils.h
inference-engine/thirdparty/clDNN/src/kernel_selector_helper.cpp
inference-engine/thirdparty/clDNN/src/layout_optimizer.cpp
inference-engine/thirdparty/clDNN/src/program.cpp
inference-engine/thirdparty/clDNN/src/program_helpers.cpp
inference-engine/thirdparty/clDNN/src/reorder.cpp
inference-engine/thirdparty/clDNN/src/strided_slice.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/concatenation_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/convolution_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/deconvolution_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/depth_to_space_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/eltwise_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fused_conv_eltwise_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/mvn_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/reorder_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/resample_gpu_test.cpp
inference-engine/thirdparty/clDNN/tests/test_cases/strided_slice_gpu_test.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/cpu_engine.cpp
inference-engine/thirdparty/mkl-dnn/src/cpu/jit_avx512_core_bf16_convolution.cpp
inference-engine/thirdparty/movidius/XLink/pc/Win/include/win_synchapi.h [new file with mode: 0644]
inference-engine/thirdparty/movidius/XLink/pc/Win/src/win_synchapi.c [new file with mode: 0644]
inference-engine/thirdparty/movidius/mvnc/include/watchdog/watchdogPrivate.hpp
inference-engine/thirdparty/movidius/mvnc/src/watchdog/watchdog.cpp
inference-engine/tools/benchmark_tool/README.md
inference-engine/tools/compile_tool/CMakeLists.txt
inference-engine/tools/vpu/vpu_compile/CMakeLists.txt
inference-engine/tools/vpu/vpu_perfcheck/CMakeLists.txt
model-optimizer/automation/create_package.py [new file with mode: 0644]
model-optimizer/automation/package_BOM.txt [new file with mode: 0644]
model-optimizer/automation/utils.py [new file with mode: 0644]
model-optimizer/extensions/analysis/boolean_input.py
model-optimizer/extensions/analysis/json_print.py
model-optimizer/extensions/analysis/tf_od_api.py
model-optimizer/extensions/back/CutMemory_test.py
model-optimizer/extensions/back/I64ToI32.py
model-optimizer/extensions/back/LSTMCellNormalizer.py
model-optimizer/extensions/back/OptimizeTransposeReshapeSequence.py
model-optimizer/extensions/back/ProposalMutation.py
model-optimizer/extensions/back/ReduceToPooling_test.py
model-optimizer/extensions/back/ReduceTransposeDimensions.py
model-optimizer/extensions/back/Reshape0DToSqueeze.py
model-optimizer/extensions/back/ShapeOfToShape.py
model-optimizer/extensions/back/ShufflenetReLUReorder_test.py
model-optimizer/extensions/back/SpecialNodesFinalization.py
model-optimizer/extensions/back/SpecialNodesFinalization_test.py
model-optimizer/extensions/back/StridedSliceMasksNormalizer.py
model-optimizer/extensions/back/TileNormalizer_test.py
model-optimizer/extensions/back/compress_quantized_weights_test.py
model-optimizer/extensions/back/disable_unsupported_ND_operations.py
model-optimizer/extensions/back/insert_compatibility_l2normalization.py
model-optimizer/extensions/front/ChangeCastOutputType.py
model-optimizer/extensions/front/ExpandDimsToUnsqueeze.py
model-optimizer/extensions/front/GeLUMerger_Erf.py
model-optimizer/extensions/front/LRNReplacer.py
model-optimizer/extensions/front/LRNReplacer_test.py
model-optimizer/extensions/front/Log1p_test.py
model-optimizer/extensions/front/LogSoftmax.py [moved from model-optimizer/extensions/front/tf/LogSoftmax.py with 69% similarity]
model-optimizer/extensions/front/LogSoftmax_test.py [new file with mode: 0644]
model-optimizer/extensions/front/MatMul_normalizer.py
model-optimizer/extensions/front/Pack_test.py
model-optimizer/extensions/front/SqueezeNormalize.py
model-optimizer/extensions/front/TopKNormalize.py
model-optimizer/extensions/front/binary_quantize_normalization.py
model-optimizer/extensions/front/binary_quantize_normalization_test.py
model-optimizer/extensions/front/caffe/accum_ext_test.py
model-optimizer/extensions/front/caffe/argmax_ext_test.py
model-optimizer/extensions/front/caffe/axpy.py
model-optimizer/extensions/front/caffe/bn_test.py
model-optimizer/extensions/front/caffe/correlation_ext_test.py
model-optimizer/extensions/front/caffe/ctcgreedydecoder_ext_test.py
model-optimizer/extensions/front/caffe/data_augmentation_ext_test.py
model-optimizer/extensions/front/caffe/elu.py
model-optimizer/extensions/front/caffe/elu_test.py
model-optimizer/extensions/front/caffe/grn_ext_test.py
model-optimizer/extensions/front/caffe/normalize_ext_test.py
model-optimizer/extensions/front/caffe/power_file_ext_test.py
model-optimizer/extensions/front/caffe/prelu_ext_test.py
model-optimizer/extensions/front/caffe/priorbox_clustered_ext_test.py
model-optimizer/extensions/front/caffe/priorbox_ext_test.py
model-optimizer/extensions/front/caffe/proposal_ext_test.py
model-optimizer/extensions/front/caffe/proposal_python_ext_test.py
model-optimizer/extensions/front/caffe/regionyolo_ext_test.py
model-optimizer/extensions/front/caffe/relu6.py
model-optimizer/extensions/front/caffe/reorgyolo_ext_test.py
model-optimizer/extensions/front/caffe/simplernms_ext_test.py
model-optimizer/extensions/front/caffe/spatial_transformer_ext_test.py
model-optimizer/extensions/front/caffe/tanh.py
model-optimizer/extensions/front/eltwise_n.py
model-optimizer/extensions/front/eltwise_n_test.py
model-optimizer/extensions/front/global_pooling_to_reduce.py
model-optimizer/extensions/front/image_scaler.py
model-optimizer/extensions/front/image_scaler_test.py
model-optimizer/extensions/front/instance_normalization.py
model-optimizer/extensions/front/instance_normalization_test.py
model-optimizer/extensions/front/kaldi/apply_counts_test.py
model-optimizer/extensions/front/kaldi/logsoftmax.py [deleted file]
model-optimizer/extensions/front/kaldi/logsoftmax_component_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/kaldi/replace_eltwise_nin1.py
model-optimizer/extensions/front/kaldi/sigmoid_ext_test.py
model-optimizer/extensions/front/kaldi/tanh_ext_test.py
model-optimizer/extensions/front/mxnet/add_input_data_to_prior_boxes_test.py
model-optimizer/extensions/front/mxnet/check_softmax_node_inputs_test.py
model-optimizer/extensions/front/mxnet/conv_ext.py
model-optimizer/extensions/front/mxnet/custom_test.py
model-optimizer/extensions/front/mxnet/elementwise_ext.py
model-optimizer/extensions/front/mxnet/gather_test.py
model-optimizer/extensions/front/mxnet/instance_norm_ext.py
model-optimizer/extensions/front/mxnet/multibox_detection_ext.py
model-optimizer/extensions/front/mxnet/mx_reshape_reverse.py
model-optimizer/extensions/front/mxnet/null_ext.py
model-optimizer/extensions/front/mxnet/pad_ext.py
model-optimizer/extensions/front/mxnet/slice_like_ext.py
model-optimizer/extensions/front/mxnet/softmax.py
model-optimizer/extensions/front/mxnet/softmax_activation_ext.py
model-optimizer/extensions/front/mxnet/softmax_ext.py
model-optimizer/extensions/front/mxnet/softmax_output_ext.py
model-optimizer/extensions/front/mxnet/squeeze_ext.py
model-optimizer/extensions/front/mxnet/ssd_detection_output_replacer.py
model-optimizer/extensions/front/mxnet/ssd_pattern_flatten_softmax_activation_test.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_flatten_test.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_reshape_test.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose.py
model-optimizer/extensions/front/mxnet/ssd_pattern_remove_transpose_test.py
model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs.py
model-optimizer/extensions/front/mxnet/ssd_reorder_detection_out_inputs_test.py
model-optimizer/extensions/front/mxnet/stack_ext.py
model-optimizer/extensions/front/mxnet/zeros_ext.py
model-optimizer/extensions/front/no_op_eraser.py
model-optimizer/extensions/front/onnx/affine_ext_test.py
model-optimizer/extensions/front/onnx/constant_fill_ext.py
model-optimizer/extensions/front/onnx/conv_ext.py
model-optimizer/extensions/front/onnx/conv_ext_test.py
model-optimizer/extensions/front/onnx/crop_ext_test.py
model-optimizer/extensions/front/onnx/deformable_conv_ext.py
model-optimizer/extensions/front/onnx/detection_output_test.py
model-optimizer/extensions/front/onnx/detectionoutput_ext.py
model-optimizer/extensions/front/onnx/dropout_ext.py
model-optimizer/extensions/front/onnx/image_scaler_ext.py
model-optimizer/extensions/front/onnx/mask_rcnn_conversion.py
model-optimizer/extensions/front/onnx/non_zero_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/person_detection_crossroad.json [new file with mode: 0644]
model-optimizer/extensions/front/onnx/person_detection_crossroad_conversion.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/pooling_ext.py
model-optimizer/extensions/front/onnx/priorbox_clustered_ext_test.py
model-optimizer/extensions/front/onnx/priorbox_ext_test.py
model-optimizer/extensions/front/onnx/quantize_ext.py
model-optimizer/extensions/front/onnx/reverse_sequence_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/roialign_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/scatter_ext.py [new file with mode: 0644]
model-optimizer/extensions/front/onnx/softmax_ext.py
model-optimizer/extensions/front/onnx/squeeze_ext.py
model-optimizer/extensions/front/onnx/transpose_ext_test.py
model-optimizer/extensions/front/reciprocal_test.py
model-optimizer/extensions/front/scatter_normalizer.py [new file with mode: 0644]
model-optimizer/extensions/front/scatter_normalizer_test.py [new file with mode: 0644]
model-optimizer/extensions/front/softsign_replacer_test.py
model-optimizer/extensions/front/squared_difference_test.py
model-optimizer/extensions/front/standalone_const_eraser.py
model-optimizer/extensions/front/tf/BlockLSTM.py
model-optimizer/extensions/front/tf/LoopCond_ext.py
model-optimizer/extensions/front/tf/SSDToolboxDetectionOutput.py
model-optimizer/extensions/front/tf/SplitConcatPairToInterpolate.py
model-optimizer/extensions/front/tf/SplitConcatPairToInterpolate_test.py
model-optimizer/extensions/front/tf/SwitchMergeOptimization.py
model-optimizer/extensions/front/tf/SwitchMergeOptimization_test.py
model-optimizer/extensions/front/tf/TensorArrayGatherV3.py
model-optimizer/extensions/front/tf/assign_elimination.py
model-optimizer/extensions/front/tf/bucketize.py
model-optimizer/extensions/front/tf/conv_ext.py
model-optimizer/extensions/front/tf/deconv_ext.py
model-optimizer/extensions/front/tf/fake_const_ext.py
model-optimizer/extensions/front/tf/mvn_unrolled_test.py
model-optimizer/extensions/front/tf/pooling_ext.py
model-optimizer/extensions/front/tf/softmax_ext.py
model-optimizer/extensions/front/tf/space_to_batch.py
model-optimizer/extensions/front/tf/space_to_depth_ext.py
model-optimizer/extensions/front/tf/sparse_fill_empty_rows_ext.py
model-optimizer/extensions/front/tf/sparse_segment_mean_ext.py
model-optimizer/extensions/front/tf/sparse_segment_sqrtn_ext.py
model-optimizer/extensions/front/tf/sparse_segment_sum_ext.py
model-optimizer/extensions/front/tf/sparse_to_dense_ext.py
model-optimizer/extensions/front/tf/sparse_weighted_sum.py
model-optimizer/extensions/front/tf/sparse_weighted_sum_test.py
model-optimizer/extensions/front/tf/swish_test.py
model-optimizer/extensions/front/tf/topk_ext.py
model-optimizer/extensions/front/tf/unique_ext.py
model-optimizer/extensions/middle/ApplyPermutations.py
model-optimizer/extensions/middle/BiasAddBroadcasting.py
model-optimizer/extensions/middle/ConstSwitchResolver.py
model-optimizer/extensions/middle/ConvertGroupedStridedSlice_test.py
model-optimizer/extensions/middle/CutInputHavingZeroDimFromConcat_test.py
model-optimizer/extensions/middle/Deconvolution3rdInputNormalization.py
model-optimizer/extensions/middle/GRURNNSequenceToTensorIterator.py
model-optimizer/extensions/middle/GroupNorm.py
model-optimizer/extensions/middle/InsertLayoutPropagationTransposes.py
model-optimizer/extensions/middle/InsertSelect_test.py
model-optimizer/extensions/middle/InterpolateSequenceToInterpolate.py
model-optimizer/extensions/middle/InterpolateSequenceToInterpolate_test.py
model-optimizer/extensions/middle/L2NormToNorm_test.py
model-optimizer/extensions/middle/LSTMRNNSequenceToTensorIterator.py
model-optimizer/extensions/middle/MXNetSplitMultiLayers.py
model-optimizer/extensions/middle/MXTileReplacer.py
model-optimizer/extensions/middle/MXTileReplacer_test.py [new file with mode: 0644]
model-optimizer/extensions/middle/MinumumMiddleReplacer_test.py
model-optimizer/extensions/middle/ReluQuantizeFuse.py
model-optimizer/extensions/middle/ReluQuantizeFuse_test.py
model-optimizer/extensions/middle/RemoveDuplicationMemory_test.py
model-optimizer/extensions/middle/RemoveUselessConcatSplit_test.py
model-optimizer/extensions/middle/RemoveUselessCrops_test.py
model-optimizer/extensions/middle/ReplaceMemoryOffsetWithSplice_test.py
model-optimizer/extensions/middle/ReplacePNormNodePattern_test.py
model-optimizer/extensions/middle/ReplaceSpliceNodePattern_test.py
model-optimizer/extensions/middle/ScaleInput_test.py
model-optimizer/extensions/middle/SharedWeightsDuplication_test.py
model-optimizer/extensions/middle/SliceConverter.py
model-optimizer/extensions/middle/TensorIteratorBackEdge.py
model-optimizer/extensions/middle/TensorIteratorBackEdge_test.py
model-optimizer/extensions/middle/TensorIteratorCondition_test.py
model-optimizer/extensions/middle/TensorIteratorInput_test.py
model-optimizer/extensions/middle/TensorIteratorLSTMToLSTMSequence.py
model-optimizer/extensions/middle/TensorIteratorMerge.py
model-optimizer/extensions/middle/TensorIteratorOutput_test.py
model-optimizer/extensions/middle/quantize_fuses_test.py
model-optimizer/extensions/middle/sparse_reshape.py
model-optimizer/extensions/middle/sparse_reshape_test.py
model-optimizer/extensions/middle/weights_permute_normalizer_test.py
model-optimizer/extensions/ops/BlockLSTM.py
model-optimizer/extensions/ops/Enter.py
model-optimizer/extensions/ops/Exit.py
model-optimizer/extensions/ops/GRU.py
model-optimizer/extensions/ops/MatMul_value_propagation_test.py
model-optimizer/extensions/ops/TensorArrayGather.py
model-optimizer/extensions/ops/TensorArrayRead.py
model-optimizer/extensions/ops/TensorIterator_ops.py
model-optimizer/extensions/ops/adaptive_avg_pooling.py
model-optimizer/extensions/ops/argmax.py
model-optimizer/extensions/ops/assert_op.py
model-optimizer/extensions/ops/aten.py
model-optimizer/extensions/ops/bucketize.py
model-optimizer/extensions/ops/bucketize_test.py
model-optimizer/extensions/ops/data_augmentation.py
model-optimizer/extensions/ops/depth_to_space.py
model-optimizer/extensions/ops/depth_to_space_test.py
model-optimizer/extensions/ops/div_value_propagation_test.py
model-optimizer/extensions/ops/exp.py
model-optimizer/extensions/ops/grn.py
model-optimizer/extensions/ops/instance_normalization.py
model-optimizer/extensions/ops/instance_normalization_test.py
model-optimizer/extensions/ops/lstm_cell.py
model-optimizer/extensions/ops/merge_test.py
model-optimizer/extensions/ops/mxslice.py
model-optimizer/extensions/ops/non_max_suppression.py
model-optimizer/extensions/ops/non_zero.py
model-optimizer/extensions/ops/normalize.py
model-optimizer/extensions/ops/pack.py
model-optimizer/extensions/ops/power_file.py
model-optimizer/extensions/ops/prediction_heatmap.py
model-optimizer/extensions/ops/prelu.py
model-optimizer/extensions/ops/proposal_python_example.py
model-optimizer/extensions/ops/reorgyolo.py
model-optimizer/extensions/ops/roialign.py [new file with mode: 0644]
model-optimizer/extensions/ops/scatter.py [new file with mode: 0644]
model-optimizer/extensions/ops/select_test.py
model-optimizer/extensions/ops/simplernms.py
model-optimizer/extensions/ops/space_to_depth.py
model-optimizer/extensions/ops/space_to_depth_test.py [new file with mode: 0644]
model-optimizer/extensions/ops/sparse_fill_empty_rows.py
model-optimizer/extensions/ops/sparse_fill_empty_rows_test.py
model-optimizer/extensions/ops/sparse_reshape_test.py
model-optimizer/extensions/ops/sparse_segment_mean.py
model-optimizer/extensions/ops/sparse_segment_sqrtn.py
model-optimizer/extensions/ops/sparse_segment_sum.py
model-optimizer/extensions/ops/sparse_to_dense_test.py
model-optimizer/extensions/ops/sparse_weighted_sum.py
model-optimizer/extensions/ops/sparse_weighted_sum_test.py
model-optimizer/extensions/ops/split.py
model-optimizer/extensions/ops/split_test.py
model-optimizer/extensions/ops/stop_gradient.py
model-optimizer/extensions/ops/switch_test.py
model-optimizer/extensions/ops/transpose_test.py
model-optimizer/extensions/ops/unique.py
model-optimizer/extensions/ops/unique_test.py
model-optimizer/mo/back/ie_ir_ver_2/emitter.py
model-optimizer/mo/front/caffe/custom_layers_mapping.py
model-optimizer/mo/front/common/layout.py
model-optimizer/mo/front/common/partial_infer/caffe_fallback.py
model-optimizer/mo/front/common/partial_infer/concat.py
model-optimizer/mo/front/common/partial_infer/roipooling.py
model-optimizer/mo/front/common/partial_infer/utils.py
model-optimizer/mo/front/common/replacement.py
model-optimizer/mo/front/extractor_test.py
model-optimizer/mo/front/kaldi/extractors/add_ext.py
model-optimizer/mo/front/kaldi/extractors/batchnorm_component_ext.py
model-optimizer/mo/front/kaldi/extractors/clip_ext.py
model-optimizer/mo/front/kaldi/extractors/copy_ext.py
model-optimizer/mo/front/kaldi/extractors/elementwise_component_ext.py
model-optimizer/mo/front/kaldi/extractors/linear_component_ext.py
model-optimizer/mo/front/kaldi/extractors/lstm_nonlinearity_ext.py
model-optimizer/mo/front/kaldi/extractors/max_pooling_ext.py
model-optimizer/mo/front/kaldi/extractors/normalize_component_ext.py
model-optimizer/mo/front/kaldi/extractors/normalize_component_ext_test.py
model-optimizer/mo/front/kaldi/extractors/pnorm_component_ext.py
model-optimizer/mo/front/kaldi/extractors/pnorm_component_ext_test.py
model-optimizer/mo/front/kaldi/extractors/rectified_linear_component_ext.py
model-optimizer/mo/front/kaldi/loader/loader.py
model-optimizer/mo/front/kaldi/loader/loader_test.py
model-optimizer/mo/front/kaldi/loader/utils.py
model-optimizer/mo/front/kaldi/utils.py
model-optimizer/mo/front/mxnet/extractors/multibox_prior.py
model-optimizer/mo/front/mxnet/extractors/utils.py
model-optimizer/mo/front/mxnet/loader.py
model-optimizer/mo/front/mxnet/nd_to_params.py
model-optimizer/mo/front/onnx/loader.py
model-optimizer/mo/front/tf/extractors/utils_test.py
model-optimizer/mo/graph/graph.py
model-optimizer/mo/graph/graph_test.py
model-optimizer/mo/main.py
model-optimizer/mo/middle/passes/conv_test.py
model-optimizer/mo/middle/passes/eliminate.py
model-optimizer/mo/middle/passes/eliminate_test.py
model-optimizer/mo/middle/passes/fusing/decomposition_test.py
model-optimizer/mo/middle/passes/fusing/fuse_linear_ops_test.py
model-optimizer/mo/middle/passes/fusing/fuse_linear_seq.py
model-optimizer/mo/middle/passes/fusing/fuse_linear_seq_test.py
model-optimizer/mo/middle/passes/fusing/resnet_optimization_test.py
model-optimizer/mo/middle/passes/infer.py
model-optimizer/mo/middle/passes/leaky_relu.py
model-optimizer/mo/middle/passes/mean_scale_values_test.py
model-optimizer/mo/middle/passes/tensor_names.py
model-optimizer/mo/ops/convolution.py
model-optimizer/mo/ops/shape.py
model-optimizer/mo/ops/softmax.py
model-optimizer/mo/ops/space_to_batch.py
model-optimizer/mo/ops/squeeze_test.py
model-optimizer/mo/ops/strided_slice.py
model-optimizer/mo/ops/strided_slice_test.py
model-optimizer/mo/ops/unsqueeze_test.py
model-optimizer/mo/utils/class_registration.py
model-optimizer/mo/utils/custom_replacement_config.py
model-optimizer/mo/utils/graph.py
model-optimizer/mo/utils/graph_test.py
model-optimizer/mo/utils/ir_reader/extender.py
model-optimizer/mo/utils/ir_reader/extenders/GRUCell_extender.py [new file with mode: 0644]
model-optimizer/mo/utils/ir_reader/extenders/LSTMCell_extender.py
model-optimizer/mo/utils/ir_reader/extenders/RNNCell_extender.py [new file with mode: 0644]
model-optimizer/mo/utils/ir_reader/extenders/binary_convolution_extender.py
model-optimizer/mo/utils/ir_reader/extenders/conv_extender.py
model-optimizer/mo/utils/ir_reader/extenders/deconvolution_extender.py
model-optimizer/mo/utils/ir_reader/extenders/deformable_convolution_extender.py
model-optimizer/mo/utils/ir_reader/extenders/experimental_extender.py
model-optimizer/mo/utils/ir_reader/extenders/fakequantize_extender.py
model-optimizer/mo/utils/ir_reader/extenders/interpolate_extender.py
model-optimizer/mo/utils/ir_reader/extenders/non_zero_extender.py [new file with mode: 0644]
model-optimizer/mo/utils/ir_reader/extenders/pad_extender.py
model-optimizer/mo/utils/ir_reader/extenders/parameter_extender.py
model-optimizer/mo/utils/ir_reader/extenders/pooling_extender.py
model-optimizer/mo/utils/ir_reader/extenders/priorbox_clustered_extender.py
model-optimizer/mo/utils/ir_reader/extenders/priorbox_extender.py
model-optimizer/mo/utils/ir_reader/extenders/reorg_yolo_extender.py
model-optimizer/mo/utils/ir_reader/extenders/strided_slice_extender.py
model-optimizer/mo/utils/ir_reader/extenders/tensoriterator_extender.py
model-optimizer/mo/utils/ir_reader/extenders/topk_extender.py
model-optimizer/mo/utils/ir_reader/extenders/variadic_split_extender.py
model-optimizer/mo/utils/ir_reader/layer_to_class.py
model-optimizer/mo/utils/pipeline_config.py
model-optimizer/mo/utils/replacement_pattern.py
model-optimizer/mo/utils/unittest/graph.py
model-optimizer/mo/utils/unsupported_ops.py
model-optimizer/mo/utils/utils_test.py
model-optimizer/mo/utils/versions_checker_test.py
model-optimizer/requirements_dev.txt
ngraph
scripts/demo/README.txt [new file with mode: 0644]
scripts/demo/car.png [new file with mode: 0644]
scripts/demo/car_1.bmp [new file with mode: 0644]
scripts/demo/demo_benchmark_app.bat [new file with mode: 0644]
scripts/demo/demo_benchmark_app.sh [new file with mode: 0644]
scripts/demo/demo_security_barrier_camera.bat [new file with mode: 0644]
scripts/demo/demo_security_barrier_camera.conf [new file with mode: 0644]
scripts/demo/demo_security_barrier_camera.sh [new file with mode: 0644]
scripts/demo/demo_squeezenet_download_convert_run.bat [new file with mode: 0644]
scripts/demo/demo_squeezenet_download_convert_run.sh [new file with mode: 0644]
scripts/demo/squeezenet1.1.labels [new file with mode: 0644]
scripts/demo/utils.sh [new file with mode: 0644]
scripts/install_dependencies/install_4_14_kernel.sh [new file with mode: 0644]
scripts/install_dependencies/install_GST_dependencies.sh [new file with mode: 0644]
scripts/install_dependencies/install_NCS_udev_rules.sh [new file with mode: 0644]
scripts/install_dependencies/install_NEO_OCL_driver.sh [new file with mode: 0644]
scripts/install_dependencies/install_guide.html [new file with mode: 0644]
scripts/install_dependencies/install_openvino_dependencies.sh [new file with mode: 0644]
scripts/setupvars/setupvars.bat [new file with mode: 0644]
scripts/setupvars/setupvars.sh [new file with mode: 0644]
scripts/utils/create_package.py [new file with mode: 0644]
scripts/utils/utils.py [new file with mode: 0644]
tests/stress_tests/.automation/env_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_references_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/unittests/nightly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/.automation/unittests/weekly_configs/desktop_test_config.xml [new file with mode: 0644]
tests/stress_tests/CMakeLists.txt [new file with mode: 0644]
tests/stress_tests/README.md [new file with mode: 0644]
tests/stress_tests/common/ie_pipelines/pipelines.cpp [new file with mode: 0644]
tests/stress_tests/common/ie_pipelines/pipelines.h [new file with mode: 0644]
tests/stress_tests/common/managers/task_manager.h [new file with mode: 0644]
tests/stress_tests/common/managers/thread_manager.h [new file with mode: 0644]
tests/stress_tests/common/tests_utils.cpp [new file with mode: 0644]
tests/stress_tests/common/tests_utils.h [new file with mode: 0644]
tests/stress_tests/common/utils.cpp [new file with mode: 0644]
tests/stress_tests/common/utils.h [new file with mode: 0644]
tests/stress_tests/memcheck_tests/CMakeLists.txt [new file with mode: 0644]
tests/stress_tests/memcheck_tests/flags.h [new file with mode: 0644]
tests/stress_tests/memcheck_tests/local_configs/env_config.xml [new file with mode: 0644]
tests/stress_tests/memcheck_tests/local_configs/references_config.xml [new file with mode: 0644]
tests/stress_tests/memcheck_tests/local_configs/test_config.xml [new file with mode: 0644]
tests/stress_tests/memcheck_tests/main.cpp [new file with mode: 0644]
tests/stress_tests/memcheck_tests/tests.cpp [new file with mode: 0644]
tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.cpp [new file with mode: 0644]
tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.h [new file with mode: 0644]
tests/stress_tests/memcheck_tests/tests_utils.h [new file with mode: 0644]
tests/stress_tests/memleaks_tests/CMakeLists.txt [new file with mode: 0644]
tests/stress_tests/memleaks_tests/flags.h [new file with mode: 0644]
tests/stress_tests/memleaks_tests/local_configs/env_config.xml [new file with mode: 0644]
tests/stress_tests/memleaks_tests/local_configs/test_config.xml [new file with mode: 0644]
tests/stress_tests/memleaks_tests/main.cpp [new file with mode: 0644]
tests/stress_tests/memleaks_tests/tests.cpp [new file with mode: 0644]
tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.cpp [new file with mode: 0644]
tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.h [new file with mode: 0644]
tests/stress_tests/scripts/get_testdata.py [new file with mode: 0644]
tests/stress_tests/unittests/CMakeLists.txt [new file with mode: 0644]
tests/stress_tests/unittests/flags.h [new file with mode: 0644]
tests/stress_tests/unittests/local_configs/env_config.xml [new file with mode: 0644]
tests/stress_tests/unittests/local_configs/test_config.xml [new file with mode: 0644]
tests/stress_tests/unittests/main.cpp [new file with mode: 0644]
tests/stress_tests/unittests/tests.cpp [new file with mode: 0644]
tests/stress_tests/unittests/tests_pipelines/tests_pipelines.cpp [new file with mode: 0644]
tests/stress_tests/unittests/tests_pipelines/tests_pipelines.h [new file with mode: 0644]
tests/stress_tests/unittests/tests_pipelines/tests_pipelines_full_pipeline.cpp [new file with mode: 0644]
tools/benchmark/README.md
tools/benchmark/benchmark.py
tools/benchmark/main.py
tools/benchmark/parameters.py
tools/benchmark/utils/utils.py

index 31184fc..547ae62 100644 (file)
-## Ignore Visual Studio temporary files, build results, and
-## files generated by popular Visual Studio add-ons.
-
-# User-specific files
-*.suo
-*.user
-*.userosscache
-*.sln.docstates
-
-# User-specific files (MonoDevelop/Xamarin Studio)
-*.userprefs
-
-# Build results
-[Dd]ebug/
-[Dd]ebugPublic/
-[Rr]elease/
-[Rr]eleases/
-[Xx]64/
-[Xx]86/
-[Bb]uild/
-bld/
-[Bb]in/
-[Oo]bj/
-
-# PY.TEST
-*.pyc
-tests/integration/report.html
-tests/integration/report.xml
-tests/integration/assets/
-tests/integration/__pycache__/
-
-# Visual Studio 2015 cache/options directory
-.vs/
-# Uncomment if you have tasks that create the project's static files in wwwroot
-#wwwroot/
-
-# MSTest test Results
-[Tt]est[Rr]esult*/
-[Bb]uild[Ll]og.*
-
-# NUNIT
-*.VisualState.xml
-TestResult.xml
-
-# Build Results of an ATL Project
-[Dd]ebugPS/
-[Rr]eleasePS/
-dlldata.c
-
-# DNX
-project.lock.json
-artifacts/
-
-*_i.c
-*_p.c
-*_i.h
-*.ilk
-*.meta
-*.obj
-*.pch
-*.pdb
-*.pgc
-*.pgd
-*.rsp
-*.sbr
-*.tlb
-*.tli
-*.tlh
-*.tmp
-*.tmp_proj
-*.log
-*.vspscc
-*.vssscc
-.builds
-*.pidb
-*.svclog
-*.scc
-
-# Chutzpah Test files
-_Chutzpah*
-
-# Visual C++ cache files
-ipch/
-*.aps
-*.ncb
-*.opendb
-*.opensdf
-*.sdf
-*.cachefile
-*.VC.db
-
-# Visual Studio profiler
-*.psess
-*.vsp
-*.vspx
-*.sap
-
-# TFS 2012 Local Workspace
-$tf/
-
-# Guidance Automation Toolkit
-*.gpState
-
-# ReSharper is a .NET coding add-in
-_ReSharper*/
-*.[Rr]e[Ss]harper
-*.DotSettings.user
-
-# JustCode is a .NET coding add-in
-.JustCode
-
-# TeamCity is a build add-in
-_TeamCity*
-
-# DotCover is a Code Coverage Tool
-*.dotCover
-
-# NCrunch
-_NCrunch_*
-.*crunch*.local.xml
-nCrunchTemp_*
-
-# MightyMoose
-*.mm.*
-AutoTest.Net/
-
-# Web workbench (sass)
-.sass-cache/
-
-# Installshield output folder
-[Ee]xpress/
-
-# DocProject is a documentation generator add-in
-DocProject/buildhelp/
-DocProject/Help/*.HxT
-DocProject/Help/*.HxC
-DocProject/Help/*.hhc
-DocProject/Help/*.hhk
-DocProject/Help/*.hhp
-DocProject/Help/Html2
-DocProject/Help/html
-
-# Click-Once directory
-publish/
-
-# Publish Web Output
-*.[Pp]ublish.xml
-*.azurePubxml
-
-# TODO: Un-comment the next line if you do not want to checkin
-# your web deploy settings because they may include unencrypted
-# passwords
-#*.pubxml
-*.publishproj
-
-# NuGet Packages
-*.nupkg
-# The packages folder can be ignored because of Package Restore
-**/packages/*
-# except build/, which is used as an MSBuild target.
-!**/packages/build/
-# Uncomment if necessary however generally it will be regenerated when needed
-#!**/packages/repositories.config
-# NuGet v3's project.json files produces more ignoreable files
-*.nuget.props
-*.nuget.targets
-
-# Microsoft Azure Build Output
-csx/
-*.build.csdef
-
-# Microsoft Azure Emulator
-ecf/
-rcf/
-
-# Microsoft Azure ApplicationInsights config file
-ApplicationInsights.config
-
-# Windows Store app package directory
-AppPackages/
-BundleArtifacts/
-
-# Visual Studio cache files
-# files ending in .cache can be ignored
-*.[Cc]ache
-# but keep track of directories ending in .cache
-!*.[Cc]ache/
-
-# Others
-ClientBin/
-[Ss]tyle[Cc]op.*
-~$*
-*~
-*.dbmdl
-*.dbproj.schemaview
-*.pfx
-*.publishsettings
-node_modules/
-orleans.codegen.cs
-
-# RIA/Silverlight projects
-Generated_Code/
-
-# Backup & report files from converting an old project file
-# to a newer Visual Studio version. Backup files are not needed,
-# because we have git ;-)
-_UpgradeReport_Files/
-Backup*/
-UpgradeLog*.XML
-UpgradeLog*.htm
-
-# SQL Server files
-*.mdf
-*.ldf
-
-# Business Intelligence projects
-*.rdl.data
-*.bim.layout
-*.bim_*.settings
-
-# Microsoft Fakes
-FakesAssemblies/
-
-# GhostDoc plugin setting file
-*.GhostDoc.xml
-
-# Target VS files:
-vsx64
-
-# Node.js Tools for Visual Studio
-.ntvs_analysis.dat
-
-# Visual Studio 6 build log
-*.plg
-
-# Visual Studio 6 workspace options file
-*.opt
-
-# Visual Studio LightSwitch build output
-**/*.HTMLClient/GeneratedArtifacts
-**/*.DesktopClient/GeneratedArtifacts
-**/*.DesktopClient/ModelManifest.xml
-**/*.Server/GeneratedArtifacts
-**/*.Server/ModelManifest.xml
-_Pvt_Extensions
-
-# LightSwitch generated files
-GeneratedArtifacts/
-ModelManifest.xml
-
-# Paket dependency manager
-.paket/paket.exe
-
-# FAKE - F# Make
-.fake/
-*.filters
-/External
-/Output
-/InferenceEngineMain/models
-/Test
-/HTTPClient/*.a
-/InferenceEngineMain/newModels
+# build/artifact dirs
+_*
+# but ensure we don't skip __init__.py
+!__init__.py
+# developer tools
+.idea
+.vscode
+cmake-build-debug
+cmake-build-release
 .DS_Store
-
-# For IDEA
-.idea/
-VS/
-Xcode/
-temp/
-report/
-.kdev4/
-*.kdev4
-*.kate-swp
-
-/lin-build
-/win-build
-/CMakeFiles
-*.stamp
-*.depend
-*.vcxproj
-*.sln
-/CMakeCache.txt
-.vimprj/
-build_IA32/
-.dir-locals.el
-GTAGS
-GPATH
-GRTAGS
-GSYMS
+**/tags
 compile_commands.json
-service/dot-net-service/Output
-**/sublime_build
-/.project
-.vscode/
-/vsx32
-/service/dot-net-service/.klocwork/DotNetService
-cmake-build-*/
-/lin64
-
-.gdb_history
+bin/
+build/
 .local_vimrc
-.ycm_extra_conf.py
-tags
-
-
-# from Model Optimizer repo
-.idea
-.project
-.cproject
-.pydevproject
-.settings
-/bin/
-/gen/
-__pycache__
-*.swp
-/config.xml
-
-# Python-specific
-.env3
-*.pyc
-
-# Tests-specific
-.coverage
-htmlcov
-pylint_report.txt
-pylint_report_comments.txt
-
-# Documentation-generated
-docs/build
-docs/source/_static
-docs/source/_templates
-docs/source/generated/
-
-# Artifacts
-/*.bin
-/*.xml
-/*.json
-/*.so
-/*.txt
-/*.mapping
-/*.dat
-/*.svg
+.gdb_history
+.vimspector.json
+doc/
+docs/build_documentation/work_dir/
+inference-engine/plugins/
+.repo/
+docs/template_plugin/html/
+CMakeLists.txt.user
+docs/IE_PLUGIN_DG/html/
index 1aaf7fa..aaedb15 100644 (file)
@@ -5,4 +5,4 @@
 [submodule "ngraph"]
        path = ngraph
        url = https://github.com/NervanaSystems/ngraph.git
-       ignore = dirty
+       ignore = dirty
\ No newline at end of file
index edf8233..5c54d32 100644 (file)
@@ -77,13 +77,13 @@ function(build_ngraph)
 
     if (NOT ANDROID)
         ngraph_set(NGRAPH_UNIT_TEST_ENABLE TRUE)
-        ngraph_set(NGRAPH_UNIT_TEST_OPENVINO_ENABLE TRUE)
+        ngraph_set(NGRAPH_IE_ENABLE TRUE)
         # ngraph_set(NGRAPH_ONNX_IMPORT_ENABLE TRUE)
         set(NGRAPH_ONNX_IMPORT_ENABLE TRUE CACHE BOOL "" FORCE)
     else()
         ngraph_set(NGRAPH_UNIT_TEST_ENABLE FALSE)
         ngraph_set(NGRAPH_TEST_UTIL_ENABLE FALSE)
-        ngraph_set(NGRAPH_UNIT_TEST_OPENVINO_ENABLE FALSE)
+        ngraph_set(NGRAPH_IE_ENABLE FALSE)
         ngraph_set(NGRAPH_ONNX_IMPORT_ENABLE FALSE)
     endif()
 
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644 (file)
index 0000000..c473d2b
--- /dev/null
@@ -0,0 +1,3 @@
+#!groovy
+
+dldtPipelineEntrypoint(this)
index bed7350..a074ecd 100644 (file)
@@ -37,8 +37,12 @@ function(ie_cpack_set_library_dir)
 
     if(WIN32)
         set(IE_CPACK_LIBRARY_PATH ${IE_CPACK_IE_DIR}/lib/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
+        set(IE_CPACK_RUNTIME_PATH ${IE_CPACK_IE_DIR}/bin/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
+        set(IE_CPACK_ARCHIVE_PATH ${IE_CPACK_IE_DIR}/lib/${CMAKE_BUILD_TYPE}/${ARCH} PARENT_SCOPE)
     else()
         set(IE_CPACK_LIBRARY_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
+        set(IE_CPACK_RUNTIME_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
+        set(IE_CPACK_ARCHIVE_PATH ${IE_CPACK_IE_DIR}/lib/${ARCH} PARENT_SCOPE)
     endif()
 endfunction()
 
@@ -59,8 +63,10 @@ macro(ie_cpack)
     set(CPACK_GENERATOR "TGZ")
     if(WIN32)
         set(CPACK_PACKAGE_NAME inference-engine_${CMAKE_BUILD_TYPE})
+        string(REPLACE "\\" "_" CPACK_PACKAGE_VERSION "${CI_BUILD_NUMBER}")
     else()
         set(CPACK_PACKAGE_NAME inference-engine)
+        string(REPLACE "/" "_" CPACK_PACKAGE_VERSION "${CI_BUILD_NUMBER}")
     endif()
     set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF)
     set(CPACK_ARCHIVE_COMPONENT_INSTALL ON)
index ae4161d..0dfc1ce 100644 (file)
@@ -118,7 +118,6 @@ function(addIeTarget)
     if (ARG_ADD_CPPLINT)
         # code style
         add_cpplint_target(${ARG_NAME}_cpplint FOR_TARGETS ${ARG_NAME})
-        add_clang_format_target(${ARG_NAME}_clang_format FOR_TARGETS ${ARG_NAME})
     endif()
     if (ARG_DEVELOPER_PACKAGE)
         # developer package
index 595d139..ded2740 100644 (file)
@@ -35,10 +35,6 @@ function(add_clang_format_target TARGET_NAME)
     set(multiValueArgs "FOR_TARGETS" "FOR_SOURCES" "EXCLUDE_PATTERNS")
     cmake_parse_arguments(CLANG_FORMAT "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-    if(CLANG_FORMAT_ALL)
-        set(all ALL)
-    endif()
-
     foreach(target IN LISTS CLANG_FORMAT_FOR_TARGETS)
         get_target_property(target_sources "${target}" SOURCES)
         list(APPEND CLANG_FORMAT_FOR_SOURCES ${target_sources})
@@ -95,7 +91,6 @@ function(add_clang_format_target TARGET_NAME)
         "All clang-format output files")
 
     add_custom_target(${TARGET_NAME}
-        ${all}
         DEPENDS ${all_output_files}
         COMMENT "[clang-format] ${TARGET_NAME}")
 
index 2740691..cd5a520 100644 (file)
@@ -4,6 +4,8 @@
 
 cmake_policy(SET CMP0054 NEW)
 
+include(models)
+
 #we have number of dependencies stored on ftp
 include(dependency_solver)
 
@@ -13,6 +15,23 @@ endif()
 
 include(ExternalProject)
 
+if (ENABLE_SAME_BRANCH_FOR_MODELS)
+    branchName(MODELS_BRANCH)
+else()
+    set(MODELS_BRANCH "master")
+endif()
+
+
+if (ENABLE_DATA)
+    add_models_repo(${ENABLE_DATA} "data:inference-engine/open-source-data.git")
+    set(MODELS_PATH "${TEMP}/data/src/data")
+    set(DATA_PATH "${MODELS_PATH}")
+endif()
+
+message(STATUS "MODELS_PATH=" ${MODELS_PATH})
+
+fetch_models_and_validation_set()
+
 include(linux_name)
 if(COMMAND get_linux_name)
     get_linux_name(LINUX_OS_NAME)
index 966feef..26fe61c 100644 (file)
@@ -11,7 +11,11 @@ file(TO_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" cache_path)
 
 set(ie_options "@IE_OPTIONS@;CMAKE_BUILD_TYPE;CMAKE_SKIP_RPATH")
 
-load_cache("${cache_path}" READ_WITH_PREFIX "" ${ie_options})
+foreach(option IN LISTS ie_options)
+    if(NOT DEFINED "${option}")
+        load_cache("${cache_path}" READ_WITH_PREFIX "" ${option})
+     endif()
+endforeach()
 
 message(STATUS "The following CMake options are exported from Inference Engine Developer package")
 message("")
index 112371c..9749c21 100644 (file)
@@ -78,7 +78,9 @@ ie_dependent_option (GAPI_TEST_PERF "if GAPI unit tests should examine performan
 
 ie_dependent_option (ENABLE_MYRIAD_MVNC_TESTS "functional and behavior tests for mvnc api" OFF "ENABLE_TESTS;ENABLE_MYRIAD" OFF)
 
-ie_dependent_option (ENABLE_SAMPLES "console samples are part of inference engine package" ON "NOT MINGW" OFF)
+ie_dependent_option (ENABLE_DATA "fetch models from open-source-data repo" ON "ENABLE_FUNCTIONAL_TESTS;NOT ANDROID" OFF)
+
+ie_dependent_option (ENABLE_SAME_BRANCH_FOR_MODELS "uses same branch for models and for inference engine, if not enabled models are taken from master" OFF "ENABLE_TESTS" OFF)
 
 ie_dependent_option (ENABLE_BEH_TESTS "tests oriented to check inference engine API corecteness" ON "ENABLE_TESTS" OFF)
 
diff --git a/inference-engine/cmake/models.cmake b/inference-engine/cmake/models.cmake
new file mode 100644 (file)
index 0000000..3203068
--- /dev/null
@@ -0,0 +1,80 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+if(ENABLE_DOCKER)
+    cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
+else()
+    cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
+endif()
+
+cmake_policy(SET CMP0054 NEW)
+
+find_package(Git REQUIRED)
+
+set(MODELS_LST "")
+set(MODELS_LST_TO_FETCH "")
+
+function (add_models_repo add_to_fetcher model_name)
+    list(LENGTH ARGV add_models_args)
+    if (add_models_args EQUAL 3)
+        list(GET ARGV 2 branch_name)
+    else()
+        set(branch_name ${MODELS_BRANCH})
+    endif()
+    if (add_to_fetcher)
+        set(model_name "${model_name}:${branch_name}")
+        list(APPEND MODELS_LST_TO_FETCH ${model_name})
+    endif()
+
+    list(APPEND MODELS_LST ${model_name})
+
+    set(MODELS_LST_TO_FETCH ${MODELS_LST_TO_FETCH} PARENT_SCOPE)
+    set(MODELS_LST ${MODELS_LST} PARENT_SCOPE)
+endfunction()
+
+function(add_lfs_repo name prefix url tag)
+    ExternalProject_Add(${name}
+        PREFIX ${prefix}
+        GIT_REPOSITORY ${url}
+        GIT_TAG ${tag}
+        GIT_CONFIG "http.sslverify=false"
+        GIT_PROGRESS 1
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        LOG_DOWNLOAD ON)
+
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} lfs install --local --force
+        WORKING_DIRECTORY ${prefix}/src/${name}
+        OUTPUT_VARIABLE lfs_output
+        RESULT_VARIABLE lfs_var)
+    if(lfs_var)
+        message(FATAL_ERROR [=[
+            Failed to setup Git LFS: ${lfs_output}
+            Git lfs must be installed in order to fetch models
+            Please install it from https://git-lfs.github.com/
+        ]=])
+    endif()
+endfunction()
+
+function (fetch_models_and_validation_set)
+    foreach(loop_var ${MODELS_LST_TO_FETCH})
+        string(REPLACE ":" ";" MODEL_CONFIG_LST ${loop_var})
+
+        list(GET MODEL_CONFIG_LST 0 folder_name)
+        list(GET MODEL_CONFIG_LST 1 repo_name)
+        list(GET MODEL_CONFIG_LST 2 branch_name)
+
+        string(FIND ${folder_name} "model" IS_MODEL)
+        if(${folder_name} MATCHES "model*")
+            set(FOLDER_NAME "/models/src")
+        endif()
+        add_lfs_repo(
+            "${folder_name}"
+            ${TEMP}${FOLDER_NAME}/${folder_name}
+            "git@gitlab-icv.inn.intel.com:${repo_name}"
+            "${branch_name}")
+    endforeach(loop_var)
+endfunction()
index c4a0354..75ecd3c 100644 (file)
@@ -90,8 +90,8 @@ function(ie_add_plugin)
         ie_cpack_add_component(${install_component} REQUIRED DEPENDS core)
 
         install(TARGETS ${IE_PLUGIN_NAME}
-            RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component}
-            ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component}
+            RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT ${install_component}
+            ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT ${install_component}
             LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT ${install_component})
     endif()
 endfunction()
index 79f7391..d0b5e20 100644 (file)
@@ -104,4 +104,4 @@ if(ANDROID)
     set(LIBUSB_LIBRARY "${LIBUSB}/libs/${ANDROID_ABI}/libusb1.0.so")
 
     log_rpath_from_dir(LIBUSB "${LIBUSB}/libs/${ANDROID_ABI}")
-endif()
\ No newline at end of file
+endif()
index b82618a..2684412 100644 (file)
@@ -1,5 +1,5 @@
 // Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+// SPDX-License-Identifier : Apache-2.0
 //
 
 #include <stdlib.h>
index 15a2b53..bf599e6 100644 (file)
@@ -1,5 +1,5 @@
 // Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+// SPDX-License-Identifier : Apache-2.0
 //
 
 #include <stdlib.h>
index ca7d92e..ef8527a 100644 (file)
@@ -28,8 +28,8 @@ export(TARGETS ${TARGET_NAME} NAMESPACE IE:: APPEND FILE "${CMAKE_BINARY_DIR}/ta
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
 
 install(DIRECTORY ${InferenceEngine_C_API_SOURCE_DIR}/include/
index 676bfae..96af9de 100644 (file)
@@ -29,15 +29,18 @@ def build_argparser():
     args = parser.add_argument_group("Options")
     args.add_argument('-h', '--help', action='help', default=SUPPRESS, help='Show this help message and exit.')
     args.add_argument("-m", "--model", help="Required. Path to an .xml file with a trained model.",
-        required=True, type=str)
+                      required=True, type=str)
     args.add_argument("-i", "--input", help="Required. Path to image file.",
-        required=True, type=str, nargs="+")
+                      required=True, type=str, nargs="+")
     args.add_argument("-l", "--cpu_extension",
-        help="Optional. Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.",
-        type=str, default=None)
+                      help="Optional. Required for CPU custom layers. "
+                           "Absolute path to a shared library with the kernels implementations.",
+                      type=str, default=None)
     args.add_argument("-d", "--device",
-        help="Optional. Specify the target device to infer on; CPU, GPU, FPGA or MYRIAD is acceptable. Sample will look for a suitable plugin for device specified (CPU by default)",
-        default="CPU", type=str)
+                      help="Optional. Specify the target device to infer on; "
+                           "CPU, GPU, FPGA or MYRIAD is acceptable. "
+                           "Sample will look for a suitable plugin for device specified (CPU by default)",
+                      default="CPU", type=str)
     args.add_argument("--labels", help="Optional. Labels mapping file", default=None, type=str)
     args.add_argument("-nt", "--number_top", help="Optional. Number of top results", default=10, type=int)
 
@@ -59,9 +62,10 @@ def main():
     # ------------- 2. Load Plugin for inference engine and extensions library if specified --------------
     log.info("Device info:")
     versions = ie.get_versions(args.device)
-    print("{}{}".format(" "*8, args.device))
-    print("{}MKLDNNPlugin version ......... {}.{}".format(" "*8, versions[args.device].major, versions[args.device].minor))
-    print("{}Build ........... {}".format(" "*8, versions[args.device].build_number))
+    print("{}{}".format(" " * 8, args.device))
+    print("{}MKLDNNPlugin version ......... {}.{}".format(" " * 8, versions[args.device].major,
+                                                          versions[args.device].minor))
+    print("{}Build ........... {}".format(" " * 8, versions[args.device].build_number))
 
     if args.cpu_extension and "CPU" in args.device:
         ie.add_extension(args.cpu_extension, "CPU")
@@ -79,8 +83,15 @@ def main():
     # -----------------------------------------------------------------------------------------------------
 
     # --------------------------- 3. Read and preprocess input --------------------------------------------
-    input_blob = next(iter(net.inputs))
-    n, c, h, w = net.inputs[input_blob].shape
+
+    print("inputs number: " + str(len(net.inputs.keys())))
+
+    for input_key in net.inputs:
+        print("input shape: " + str(net.inputs[input_key].shape))
+        print("input key: " + input_key)
+        if len(net.inputs[input_key].layout) == 4:
+            n, c, h, w = net.inputs[input_key].shape
+
     images = np.ndarray(shape=(n, c, h, w))
     images_hw = []
     for i in range(n):
@@ -94,13 +105,14 @@ def main():
             log.warning("Image {} is resized from {} to {}".format(args.input[i], image.shape[:-1], (h, w)))
         image = image.transpose((2, 0, 1))  # Change data layout from HWC to CHW
         images[i] = image
+
     # -----------------------------------------------------------------------------------------------------
 
     # --------------------------- 4. Configure input & output ---------------------------------------------
     # --------------------------- Prepare input blobs -----------------------------------------------------
     log.info("Preparing input blobs")
-    assert (len(net.inputs.keys()) == 1 or len(net.inputs.keys()) == 2), "Sample supports topologies only with 1 or 2 inputs"
-    input_blob = next(iter(net.inputs))
+    assert (len(net.inputs.keys()) == 1 or len(
+        net.inputs.keys()) == 2), "Sample supports topologies only with 1 or 2 inputs"
     out_blob = next(iter(net.outputs))
     input_name, input_info_name = "", ""
 
@@ -112,9 +124,21 @@ def main():
         elif len(net.inputs[input_key].layout) == 2:
             input_info_name = input_key
             net.inputs[input_key].precision = 'FP32'
-            if net.inputs[input_key].shape[1] != 3 and net.inputs[input_key].shape[1] != 6 or net.inputs[input_key].shape[0] != 1:
+            if net.inputs[input_key].shape[1] != 3 and net.inputs[input_key].shape[1] != 6 or \
+                net.inputs[input_key].shape[0] != 1:
                 log.error('Invalid input info. Should be 3 or 6 values length.')
 
+    data = {}
+    data[input_name] = images
+
+    if input_info_name != "":
+        infos = np.ndarray(shape=(n, c), dtype=float)
+        for i in range(n):
+            infos[i, 0] = h
+            infos[i, 1] = w
+            infos[i, 2] = 1.0
+        data[input_info_name] = infos
+
     # --------------------------- Prepare output blobs ----------------------------------------------------
     log.info('Preparing output blobs')
 
@@ -141,7 +165,7 @@ def main():
     log.info("Loading model to the device")
     exec_net = ie.load_network(network=net, device_name=args.device)
     log.info("Creating infer request and starting inference")
-    res = exec_net.infer(inputs={input_blob: images})
+    res = exec_net.infer(inputs=data)
     # -----------------------------------------------------------------------------------------------------
 
     # --------------------------- Read and postprocess output ---------------------------------------------
@@ -159,8 +183,8 @@ def main():
             ymin = np.int(ih * proposal[4])
             xmax = np.int(iw * proposal[5])
             ymax = np.int(ih * proposal[6])
-            print("[{},{}] element, prob = {:.6}    ({},{})-({},{}) batch id : {}"\
-                .format(number, label, confidence, xmin, ymin, xmax, ymax, imid), end="")
+            print("[{},{}] element, prob = {:.6}    ({},{})-({},{}) batch id : {}" \
+                  .format(number, label, confidence, xmin, ymin, xmax, ymax, imid), end="")
             if proposal[2] > 0.5:
                 print(" WILL BE PRINTED!")
                 if not imid in boxes.keys():
@@ -181,7 +205,8 @@ def main():
     # -----------------------------------------------------------------------------------------------------
 
     log.info("Execution successful\n")
-    log.info("This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool")
+    log.info(
+        "This sample is an API example, for any performance measurements please use the dedicated benchmark_app tool")
 
 
 if __name__ == '__main__':
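
The hunk above reworks the SSD sample so it discovers its inputs by rank instead of assuming a single input blob: the 4-dimensional input receives the image batch, and an optional 2-dimensional "image info" input receives height, width and scale. The following is a minimal sketch of that pattern only, not the full sample; the model paths, the device name and the zero-filled batch are placeholders, not values taken from this commit.

```python
import numpy as np
from openvino.inference_engine import IECore

ie = IECore()
# hypothetical IR paths, for illustration only
net = ie.read_network(model="model.xml", weights="model.bin")

image_blob, info_blob = None, None
for name, info in net.inputs.items():
    if len(info.shape) == 4:        # N, C, H, W -> image input
        image_blob = name
        n, c, h, w = info.shape
    elif len(info.shape) == 2:      # 1 x 3 (or 1 x 6) -> image info input
        info_blob = name

# dummy batch in place of the resized, HWC->CHW-transposed images
data = {image_blob: np.zeros((n, c, h, w), dtype=np.float32)}
if info_blob:
    data[info_blob] = np.array([[h, w, 1.0]], dtype=np.float32)

exec_net = ie.load_network(network=net, device_name="CPU")
res = exec_net.infer(inputs=data)
```

Feeding a dict keyed by input name (rather than a single `input_blob`) is what lets the same call serve both one-input and two-input detection topologies.
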
index 50b2756..042091d 100644 (file)
@@ -39,6 +39,16 @@ add_custom_command(TARGET ${TARGET_NAME}
     COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../__init__.py
 )
 
+# creates a folder in openvino directory and a symlink to benchmark
+# inside bin directory for developers for running python benchmark_app
+if(UNIX)
+    add_custom_command(TARGET ${TARGET_NAME}
+        POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../tools
+)
+    file(COPY ${OpenVINO_MAIN_SOURCE_DIR}/tools/benchmark DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/../tools/)
+endif()
+
 # install
 
 install(TARGETS ${TARGET_NAME}
index d9c7c56..648fa6a 100644 (file)
@@ -171,9 +171,9 @@ cdef class IECore:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie = IECore()
-    #  exec_net = ie.load_network(network=net, device_name="CPU", num_requsts=2)
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+    #  exec_net = ie.load_network(network=net, device_name="CPU", num_requests=2)
     #  ```
     cpdef ExecutableNetwork load_network(self, IENetwork network, str device_name, config=None, int num_requests=1):
         cdef ExecutableNetwork exec_net = ExecutableNetwork()
@@ -197,8 +197,8 @@ cdef class IECore:
     #  @return An `ExecutableNetwork` object
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie.load_network(network=net, device_name="MYRIAD", num_requsts=2)
     #  # export executable network
     #  exec_net.export(path_to_file_to_save)
@@ -226,8 +226,8 @@ cdef class IECore:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  layers_map = ie.query_network(network=net, device_name="HETERO:GPU,CPU")
     #  ```
     def query_network(self, IENetwork network, str device_name, config=None):
@@ -238,12 +238,19 @@ cdef class IECore:
         return c_map_to_dict(res)
 
     ## Sets a configuration for a plugin
-    #  NOTE: When specifying a key value of a config, the "KEY_" prefix is omitted.
+    #
+    #  \note When specifying a key value of a config, the "KEY_" prefix is omitted.
+    #
     #  @param config: a dictionary of configuration parameters as keys and their values
     #  @param device_name: a device name of a target plugin
     #  @return None
     #
-    #  Usage examples: See the `set_affinity` method of the `IENetwork` class
+    #  Usage examples:\n
+    #  ```python
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+    #  ie.set_config({"DYN_BATCH_ENABLED": "YES"})
+    #  ```
     def set_config(self, config: dict, device_name: str):
         cdef map[string, string] c_config = dict_to_c_map(config)
         self.impl.setConfig(c_config, device_name.encode())
@@ -316,7 +323,9 @@ cdef class IECore:
 
     ## Gets a configuration dedicated to device behavior. The method targets to extract information
     #  which can be set via set_config method.
-    #  NOTE: When specifying a key value of a config, the "KEY_" prefix is omitted.
+    #
+    #  \note When specifying a key value of a config, the "KEY_" prefix is omitted.
+    #
     #  @param device_name: A name of a device to get a config value.
     #  @param config_name: A config name to request.
     #  @return A config value corresponding to a config key.
@@ -452,8 +461,8 @@ cdef class ExecutableNetwork:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie_core = IECore()
+    #  net = ie_core.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie_core.load_network(net, device, num_requests=2)
     #  res = exec_net.infer({'data': img})
     #  res
@@ -531,8 +540,8 @@ cdef class ExecutableNetwork:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie_core = IECore()
+    #  net = ie_core.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie_core.load_network(net, device, num_requsts=2)
     #  exec_graph = exec_net.get_exec_graph_info()
     #  ```
@@ -549,7 +558,7 @@ cdef class ExecutableNetwork:
     #  Usage example:\n
     #  ```python
     #  ie = IECore()
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie.load_network(net, "CPU")
     #  exec_net.get_metric("NETWORK_NAME")
     #  ```
@@ -564,7 +573,7 @@ cdef class ExecutableNetwork:
     #  Usage example:\n
     #  ```python
     #  ie = IECore()
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie.load_network(net, "CPU")
     #  exec_net.get_metric("DEVICE_ID")
     #  ```
@@ -576,8 +585,8 @@ cdef class ExecutableNetwork:
     #  @return None
     #
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  exec_net = ie.load_network(network=net, device_name="MYRIAD", num_requsts=2)
     #  exec_net.export(path_to_file_to_save)
     #  ```
@@ -632,8 +641,8 @@ cdef class InferRequest:
     #  Usage example:\n
     #  ```python
     #  callback = lambda status, py_data: print("Request with id {} finished with status {}".format(py_data, status))
-    #  net = IENetwork("./model.xml", "./model.bin")
     #  ie = IECore()
+    #  net = ie.read_network(model="./model.xml", weights="./model.bin")
     #  exec_net = ie.load_network(net, "CPU", num_requests=4)
     #  for id, req in enumerate(exec_net.requests):
     #      req.set_completion_callback(py_callback=callback, py_data=id)
@@ -662,7 +671,7 @@ cdef class InferRequest:
     #
     #  Usage example:\n
     #  ```python
-    #  exec_net = plugin.load(network=net, num_requests=2)
+    #  exec_net = ie_core.load_network(network=net, num_requests=2)
     #  exec_net.requests[0].infer({input_blob: image})
     #  res = exec_net.requests[0].outputs['prob']
     #  np.flip(np.sort(np.squeeze(res)),0)
@@ -683,7 +692,7 @@ cdef class InferRequest:
     #
     #  Usage example:\n
     #  ```python
-    #  exec_net = plugin.load(network=net, num_requests=2)
+    #  exec_net = ie_core.load_network(network=net, num_requests=2)
     #  exec_net.requests[0].async_infer({input_blob: image})
     #  request_status = exec_net.requests[0].wait()
     #  res = exec_net.requests[0].outputs['prob']
@@ -697,7 +706,8 @@ cdef class InferRequest:
 
     ## Waits for the result to become available. Blocks until specified timeout elapses or the result
     #  becomes available, whichever comes first.
-    #  NOTE: There are special values of the timeout parameter:
+    #
+    #  \note There are special values of the timeout parameter:
     #  * 0 - Immediately returns the inference status. It does not block or interrupt execution.
     #        To find statuses meaning, please refer to InferenceEngine::StatusCode in Inference Engine C++ documentation
     #  * -1 - Waits until inference result becomes available (default value)
@@ -724,12 +734,14 @@ cdef class InferRequest:
         return deref(self.impl).wait(<int64_t> timeout)
 
     ## Queries performance measures per layer to get feedback of what is the most time consuming layer.
-    #  NOTE: Performance counters data and format depends on the plugin
+    #
+    #  \note Performance counters data and format depends on the plugin
+    #
     #  @return Dictionary containing per-layer execution information.
     #
     #  Usage example:
     #  ```python
-    #  exec_net = plugin.load(network=net, num_requests=2)
+    #  exec_net = ie_core.load_network(network=net, num_requests=2)
     #  exec_net.requests[0].infer({input_blob: image})
     #  exec_net.requests[0].get_perf_counts()
     #  {'Conv2D': {'exec_type': 'jit_avx2_1x1',
@@ -780,18 +792,20 @@ cdef class InferRequest:
 
     ## Sets new batch size for certain infer request when dynamic batching is enabled in executable network
     #  that created this request.
-    #  NOTE: Support of dynamic batch size depends on the target plugin.
+    #
+    #  \note Support of dynamic batch size depends on the target plugin.
     #
     #  @param size: New batch size to be used by all the following inference calls for this request
     #  @return None
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  # Set max batch size
     #  net.batch = 10
-    #  plugin.set_config({"DYN_BATCH_ENABLED": "YES"})
-    #  exec_net = plugin.load(network=net)
+    #  ie.set_config({"DYN_BATCH_ENABLED": "YES"})
+    #  exec_net = ie.load_network(network=net)
     #  # Set batch size for certain network.
     #  # NOTE: Input data shape will not be changed, but will be used partially in inference which increases performance
     #  exec_net.requests[0].set_batch(2)
@@ -855,7 +869,11 @@ cdef class IENetLayer:
     def type(self):
         return deref(self._ptr).type.decode()
 
-    ## Layer base operating precision. Provides getter and setter interfaces.
+    ## \note This property is deprecated.
+    #  Please, use out_data property to access DataPtr objects for all output ports, which contains full
+    #  information about layer's output data including precision.
+    #
+    #  Layer base operating precision. Provides getter and setter interfaces.
     @property
     def precision(self):
         warnings.filterwarnings("always", category=DeprecationWarning)
@@ -874,8 +892,8 @@ cdef class IENetLayer:
     #  The affinity attribute provides getter and setter interfaces, so the layer affinity can be modified directly.
     #  For example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
     #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  layers_map = ie.query_network(network=net, device_name="HETERO:GPU,CPU")
     #  layers = net.layers
     #  for layer, device in layers_map.items():
@@ -922,8 +940,10 @@ cdef class IENetLayer:
                 input_to_list.append(deref(layer.second).name.decode())
         return input_to_list
 
-    ## Deprecated: use out_data property to access DataPtr objects for all output ports, which contains full
+    ## \note This property is deprecated.
+    # Please, use out_data property to access DataPtr objects for all output ports, which contains full
     # information about layer's output data including layout
+    #
     # Returns the layout of the layer output data on 1st port
     @property
     def layout(self):
@@ -936,8 +956,10 @@ cdef class IENetLayer:
         cdef C.DataPtr c_input = deref(self._ptr).outData[0]
         return layout_int_to_str_map[deref(c_input).getLayout()]
 
-    ## Deprecated: use out_data property to access DataPtr objects for all output ports, which contains full
+    ## \note This property is deprecated.
+    # Please, use out_data property to access DataPtr objects for all output ports, which contains full
     # information about layer's output data including shape
+    #
     # Return the list of dimension of the layer output data on 1st port
     @property
     def shape(self):
@@ -988,7 +1010,10 @@ cdef class IENetLayer:
             weights_buffer.reset(blob.second)
             blobs_map[blob.first.decode()] = weights_buffer.to_numpy()
         return blobs_map
-    ## Dictionary with layer weights, biases or custom blobs if any
+    ## \note This property is deprecated.
+    #  Please use blobs property instead.
+    #
+    #  Dictionary with layer weights, biases or custom blobs if any
     @property
     def weights(self):
         warnings.filterwarnings("always", category=DeprecationWarning)
@@ -1003,6 +1028,9 @@ cdef class IENetLayer:
 cdef class IENetwork:
     ## Class constructor
     #
+    #  \note Reading networks using IENetwork constructor is deprecated.
+    #  Please, use IECore.read_network() method instead.
+    #
     #  @param model: A `.xml` file of the IR or PyCapsule containing smart pointer to nGraph function.
     #                In case of passing a `.xml` file  attribute value can be a string path or bytes with file content
     #                depending on `init_from_buffer` attribute value
@@ -1100,8 +1128,9 @@ cdef class IENetwork:
     ## Batch size of the network. Provides getter and setter interfaces to get and modify the
     #  network batch size. For example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
-    #  print(et.batch_size)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
+    #  print(net.batch_size)
     #  net.batch_size = 4
     #  print(net.batch_size)
     #  print(net.inputs['data'].shape)
@@ -1109,7 +1138,9 @@ cdef class IENetwork:
     @property
     def batch_size(self):
         return self.impl.getBatch()
-    ## Deprecated: network precision does not make sence, use precision on egdes.
+    ## \note This property is deprecated:
+    #  network precision does not make sense, use precision on edges.
+    #
     #  Precision of the network
     @property
     def precision(self):
@@ -1139,13 +1170,16 @@ cdef class IENetwork:
             layers[deref(l).name.decode()] = net_l
         return layers
 
-    ## Deprecated: new Calibration Tool doesn't generate statistics
+    ## \note This property is deprecated.
+    #  New Calibration Tool doesn't generate statistics
+    #
     #  Returns `LayersStatsMap` object containing dictionary that maps network layer names to calibration statistics
     #  represented by `LayerStats`  objects.
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  net.stats.update({"conv1_2d" : LayserStats(min=(-25, -1, 0), max=(63, 124, 70)),
     #                    "conv2_2d" : LayserStats(min=(-5, -1, 0, 1, -7, 2), max=(63, 124, 70, 174, 99, 106))
     #                   })
@@ -1163,26 +1197,6 @@ cdef class IENetwork:
                                                          max=tuple(it.second["max".encode()]))
         return py_stats_map
 
-    ## NOTE: The function is deprecated. Please use the `IENetwork()` class constructor
-    #        to create valid instance of `IENetwork`.
-    #
-    #  Reads the model from the `.xml` and `.bin` files of the IR.
-    #
-    #  @param model: Path to `.xml` file  of the IR
-    #  @param weights: Path to `.bin` file  of the IR
-    #  @return An instance of the `IENetwork` class
-    @classmethod
-    def from_ir(cls, model: str, weights: str):
-        warnings.filterwarnings("always", category=DeprecationWarning)
-        warnings.warn("from_ir() method of IENetwork is deprecated. "
-                      "Please use IENetwork class constructor to create valid IENetwork instance",
-                      DeprecationWarning)
-        if not os.path.isfile(model):
-            raise Exception("Path to the model {} doesn't exists or it's a directory".format(model))
-        if not os.path.isfile(weights):
-            raise Exception("Path to the weights {} doesn't exists or it's a directory".format(weights))
-        cdef IENetwork net = IENetwork(model, weights)
-        return net
 
     ## Marks any intermediate layer as output layer to retrieve the inference results from the specified layers.
     #  @param outputs: List of layers to be set as model outputs. The list can contain strings with layer names to be set
@@ -1192,7 +1206,8 @@ cdef class IENetwork:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  net.add_outputs(["conv5_1', conv2_1', (split_2, 1)])]
     #  ```
     def add_outputs(self, outputs):
@@ -1216,14 +1231,16 @@ cdef class IENetwork:
     #
     #  Usage example:
     #  ```python
-    #  net = IENetwork(model=path_to_model, weights=path_to_weights)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml, weights=path_to_bin)
     #  net.serialize(path_to_xml, path_to_bin)
     #  ```
     def serialize(self, path_to_xml, path_to_bin: str = ""):
         self.impl.serialize(path_to_xml.encode(), path_to_bin.encode())
 
     ## Reshapes the network to change spatial dimensions, batch size, or any dimension.
-    #  NOTE: Before using this method, make sure that the target shape is applicable for the network.
+    #
+    #  \note Before using this method, make sure that the target shape is applicable for the network.
     #        Changing the network shape to an arbitrary value may lead to unpredictable behaviour.
     #
     #  @param input_shapes: A dictionary that maps input layer names to tuples with the target shape
@@ -1231,7 +1248,8 @@ cdef class IENetwork:
     #
     #  Usage example:\n
     #  ```python
-    #  net = IENetwork(model=path_to_xml_file, weights=path_to_bin_file)
+    #  ie = IECore()
+    #  net = ie.read_network(model=path_to_xml_file, weights=path_to_bin_file)
     #  input_layer = next(iter(net.inputs))
     #  n, c, h, w = net.inputs[input_layer].shape
     #  net.reshape({input_layer: (n, c, h*2, w*2)})
@@ -1255,9 +1273,11 @@ cdef class IENetwork:
     #     return self.impl.getFunction()
 
 ## This class is the main plugin interface and serves to initialize and configure the plugin.
+#
+# \note This class is deprecated: Use IECore instead
+#
 cdef class IEPlugin:
-    ## Deprecated: Use IECore instead
-    #  Class constructor
+    ##  Class constructor
     #
     #  @param device: Target device name. Supported devices: CPU, GPU, FPGA, MYRIAD, HETERO, MULTI
     #  @param plugin_dirs: List of paths to plugin directories
index 5731cb7..40f6264 100644 (file)
@@ -40,7 +40,7 @@ public:
     /**
      * @brief A default constructor
      */
-    CNNNetReader(): actual(shared_from_irelease(InferenceEngine::CreateCNNNetReader())) {
+    CNNNetReader(): actual(InferenceEngine::CreateCNNNetReaderPtr()) {
         if (actual == nullptr) {
             THROW_IE_EXCEPTION << "CNNNetReader was not initialized.";
         }
@@ -182,7 +182,7 @@ public:
     }
 
 private:
-    std::shared_ptr<ICNNNetReader> actual;
+    CNNNetReaderPtr actual;
     std::shared_ptr<CNNNetwork> network;
 };
 IE_SUPPRESS_DEPRECATED_END
index db6dde4..16fb7cd 100644 (file)
@@ -66,8 +66,11 @@ public:
      * @param reader Pointer to the ICNNNetReader object
      */
     IE_SUPPRESS_DEPRECATED_START
-    explicit CNNNetwork(std::shared_ptr<ICNNNetReader> reader): reader(reader), actual(reader->getNetwork(nullptr)) {
-        if (actual == nullptr) {
+    explicit CNNNetwork(CNNNetReaderPtr reader_): reader(reader_) {
+        if (reader == nullptr) {
+            THROW_IE_EXCEPTION << "ICNNNetReader was not initialized.";
+        }
+        if ((actual = reader->getNetwork(nullptr)) == nullptr) {
             THROW_IE_EXCEPTION << "CNNNetwork was not initialized.";
         }
     }
@@ -161,6 +164,15 @@ public:
     }
 
     /**
+     * @brief An overloaded operator cast to get pointer on current network
+     *
+     * @return A shared pointer of the current network
+     */
+    operator std::shared_ptr<ICNNNetwork>() {
+        return network;
+    }
+
+    /**
      * @brief An overloaded operator & to get current network
      *
      * @return An instance of the current network
@@ -183,6 +195,15 @@ public:
      *
      * @return constant nGraph function
      */
+    std::shared_ptr<ngraph::Function> getFunction() noexcept {
+        return actual->getFunction();
+    }
+
+    /**
+     * @brief Returns constant nGraph function
+     *
+     * @return constant nGraph function
+     */
     std::shared_ptr<const ngraph::Function> getFunction() const noexcept {
         return actual->getFunction();
     }
@@ -297,7 +318,7 @@ protected:
      * @brief Reader extra reference, might be nullptr
      */
     IE_SUPPRESS_DEPRECATED_START
-    std::shared_ptr<ICNNNetReader> reader;
+    CNNNetReaderPtr reader;
     IE_SUPPRESS_DEPRECATED_END
     /**
      * @brief Network extra interface, might be nullptr
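A minimal usage sketch for the new non-const `getFunction()` overload (hedged: it assumes an IR v10 model read through `Core::ReadNetwork`, so an nGraph function is attached; the model path is illustrative):

```cpp
#include <iostream>
#include <memory>

#include <ie_core.hpp>
#include <ngraph/function.hpp>

void print_ops() {
    InferenceEngine::Core ie;
    InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml", "model.bin");

    // Non-const overload added in this change; returns nullptr for non-nGraph (IR v7) networks.
    std::shared_ptr<ngraph::Function> function = network.getFunction();
    if (function) {
        for (const auto& op : function->get_ops())
            std::cout << op->get_friendly_name() << std::endl;
    }
}
```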
index 8d07706..df219cd 100644 (file)
@@ -58,6 +58,7 @@ public:
         IE_SUPPRESS_DEPRECATED_END
     }
 
+private:
     /**
      * @brief Loads function from the library and returns a pointer to it
      * @param functionName Name of function to load
@@ -127,6 +128,15 @@ public:
     }
 
     /**
+     * @brief Constructs an object with existing loader
+     * @param so_loader Existing pointer to a library loader
+     */
+    explicit SOPointer(std::shared_ptr<Loader> so_loader)
+        : _so_loader(so_loader),
+          _pointedObj(details::shared_from_irelease(
+              SymbolLoader<Loader>(_so_loader).template instantiateSymbol<T>(SOCreatorTrait<T>::name))) {}
+
+    /**
      * @brief The copy-like constructor, can create So Pointer that dereferenced into child type if T is derived of U
      * @param that copied SOPointer object
      */
@@ -183,6 +193,7 @@ protected:
      * @brief Gets a smart pointer to the DLL
      */
     std::shared_ptr<Loader> _so_loader;
+
     /**
      * @brief Gets a smart pointer to the custom object
      */
index af79c19..21b611f 100644 (file)
 
 #include <map>
 #include <string>
+#include <vector>
 
 #include "details/ie_no_copy.hpp"
+#include "details/ie_so_pointer.hpp"
 #include "ie_api.h"
 #include "ie_blob.h"
 #include "ie_common.h"
@@ -118,14 +120,45 @@ public:
      * @return IR version number: 1 or 2
      */
     virtual int getVersion(ResponseDesc* resp) noexcept = 0;
+
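+    /**
+     * @brief Registers the given extensions in the reader so they can be used while parsing the network
+     * @param ext Vector of extension pointers
+     */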
+    virtual void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext) = 0;
+
+    /**
+     * @brief A virtual destructor.
+     */
+    ~ICNNNetReader() override = default;
+};
+
+IE_SUPPRESS_DEPRECATED_START
+
+namespace details {
+
+/**
+ * @brief This class defines the name of the factory for creating an ICNNNetReader object in DLL
+ */
+template<>
+class SOCreatorTrait<ICNNNetReader> {
+public:
+    /**
+     * @brief A name of the factory for creating an ICNNNetReader object in DLL
+     */
+    static constexpr auto name = "CreateICNNNetReader";
 };
 
+}  // namespace details
+
+/**
+ * @brief A C++ helper to work with objects created by the IR readers plugin.
+ * Implements different interfaces.
+ */
+using CNNNetReaderPtr = InferenceEngine::details::SOPointer<ICNNNetReader, InferenceEngine::details::SharedObjectLoader>;
+
 /**
  * @brief Creates a CNNNetReader instance
- *
  * @return An object that implements the ICNNNetReader interface
  */
-IE_SUPPRESS_DEPRECATED_START
-INFERENCE_ENGINE_API(ICNNNetReader*) CreateCNNNetReader() noexcept;
+INFERENCE_ENGINE_API_CPP(CNNNetReaderPtr) CreateCNNNetReaderPtr() noexcept;
+
 IE_SUPPRESS_DEPRECATED_END
+
 }  // namespace InferenceEngine
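A hedged sketch of driving the new helper directly; the `ReadNetwork`/`ReadWeights`/`getNetwork` calls are the deprecated `ICNNNetReader` interface, and the file names are illustrative (most code should keep using the `CNNNetReader` or `Core` wrappers):

```cpp
#include <ie_icnn_net_reader.h>

void read_ir_with_reader_ptr() {
    // The returned SOPointer keeps the IR readers library loaded while the object is alive.
    InferenceEngine::CNNNetReaderPtr reader = InferenceEngine::CreateCNNNetReaderPtr();

    InferenceEngine::ResponseDesc resp;
    reader->ReadNetwork("model.xml", &resp);   // parse the topology
    reader->ReadWeights("model.bin", &resp);   // attach the weights
    InferenceEngine::ICNNNetwork* network = reader->getNetwork(&resp);
    (void)network;
}
```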
index 68b67ca..c7eca27 100644 (file)
@@ -48,6 +48,12 @@ public:
     using Ptr = std::shared_ptr<ICNNNetwork>;
 
     /**
+     * @brief Returns nGraph function
+     * @return nGraph function
+     */
+    virtual std::shared_ptr<ngraph::Function> getFunction() noexcept = 0;
+
+    /**
      * @brief Returns constant nGraph function
      * @return constant nGraph function
      */
index 0bc591d..fa40cdd 100644 (file)
@@ -2290,23 +2290,20 @@ public:
 
 /**
 * @deprecated Migrate to IR v10 and work with ngraph::Function directly. This class will be removed in 2020.3
- * @brief This class represents a standard Scatter layer
+ * @brief This class represents a standard ScatterUpdate layer
  */
-class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ScatterLayer): public CNNLayer {
+class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ScatterUpdateLayer): public CNNLayer {
 public:
     /**
-     * @brief The axis in Dictionary to scatter Indexes from
-     */
-    int axis = 0;
-    /**
-     * @brief Creates a new ScatterLayer instance.
+     * @brief Creates a new ScatterUpdateLayer instance.
      */
     using CNNLayer::CNNLayer;
 
-    ~ScatterLayer() override;
+    ~ScatterUpdateLayer() override;
 };
 
 /**
+ * @deprecated Migrate to IR v10 and work with ngraph::Function directly. This class will be removed in 2020.3
  * @brief This class represents an onnx ExperimentalDetectronPriorGridGenerator Layer
  */
 class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronPriorGridGeneratorLayer): public CNNLayer {
@@ -2341,6 +2338,23 @@ public:
 };
 
 /**
+ * @brief This class represents a standard ExperimentalDetectronTopKROIs layer
+ */
+class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronTopKROIs): public CNNLayer {
+public:
+    /**
+     * @brief The maximum number of output rois
+     */
+    int max_rois = 0;
+    /**
+     * @brief Creates a new ExperimentalDetectronTopKROIs instance.
+     */
+    using CNNLayer::CNNLayer;
+
+    virtual ~ExperimentalDetectronTopKROIs();
+};
+
+/**
  * @brief This class represents an onnx ExperimentalDetectronGenerateProposalsSingleImage Layer
  */
 class INFERENCE_ENGINE_INTERNAL_CNNLAYER_CLASS(ExperimentalDetectronGenerateProposalsSingleImageLayer): public CNNLayer {
index f9ad11d..bb904ef 100644 (file)
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include "ie_api.h"
+#include "ie_blob.h"
 
 namespace ngraph {
 
@@ -325,6 +326,7 @@ private:
 };
 
 #ifdef __clang__
+extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<InferenceEngine::Blob::Ptr>);
 extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<int>);
 extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<bool>);
 extern template struct INFERENCE_ENGINE_API_CLASS(InferenceEngine::Parameter::RealData<float>);
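A hedged illustration of why the `Blob::Ptr` specialization is now exported: a `Parameter` can carry a blob across the plugin API boundary (the blob shape and precision below are arbitrary):

```cpp
#include <ie_blob.h>
#include <ie_parameter.hpp>

void parameter_blob_example() {
    InferenceEngine::TensorDesc desc(InferenceEngine::Precision::FP32, {1, 3}, InferenceEngine::Layout::NC);
    InferenceEngine::Blob::Ptr blob = InferenceEngine::make_shared_blob<float>(desc);
    blob->allocate();

    InferenceEngine::Parameter param = blob;                  // stored through RealData<Blob::Ptr>
    auto restored = param.as<InferenceEngine::Blob::Ptr>();   // retrieves the same shared blob
    (void)restored;
}
```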
index 317a911..49a6212 100644 (file)
@@ -348,5 +348,17 @@ DECLARE_CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS);
  */
 DECLARE_CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT);
 
+
+/**
+ * @brief The name for setting to execute in bfloat16 precision whenever it is possible
+ *
+ * This option lets the plugin know to downscale the precision where it sees performance benefits from
+ * bfloat16 execution.
+ * This option does not guarantee accuracy of the network; the accuracy in this mode should be
+ * verified separately by the user, and based on the performance and accuracy results it is the
+ * user's decision whether to use this option.
+ */
+DECLARE_CONFIG_KEY(ENFORCE_BF16);
+
 }  // namespace PluginConfigParams
 }  // namespace InferenceEngine
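A minimal sketch of enabling the new key through the standard `Core::SetConfig` path (whether the CPU plugin actually switches to bfloat16 depends on hardware support, and accuracy should be re-verified as the comment above says):

```cpp
#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

void enable_bf16() {
    InferenceEngine::Core ie;
    // Allow the CPU plugin to downscale precision to bfloat16 where it expects a speedup.
    ie.SetConfig({{ CONFIG_KEY(ENFORCE_BF16), CONFIG_VALUE(YES) }}, "CPU");
}
```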
index 4252503..f0a05b5 100644 (file)
@@ -26,7 +26,8 @@ public:
         UNSPECIFIED = 255, /**< Unspecified value. Used by default */
         MIXED = 0,         /**< Mixed value. Can be received from network. No applicable for tensors */
         FP32 = 10,         /**< 32bit floating point value */
-        FP16 = 11,         /**< 16bit floating point value */
+        FP16 = 11,         /**< 16bit floating point value, 5 bit for exponent, 10 bit for mantissa */
+        BF16 = 12,         /**< 16bit floating point value, 8 bit for exponent, 7 bit for mantissa */
         Q78 = 20,          /**< 16bit specific signed fixed point precision */
         I16 = 30,          /**< 16bit signed integer value */
         U8 = 40,           /**< 8bit unsigned integer value */
@@ -106,6 +107,7 @@ public:
             switch (precisionInfo.value) {
                 CASE(FP32, float);
                 CASE2(FP16, int16_t, uint16_t);
+                CASE2(BF16, int16_t, uint16_t);
                 CASE(I16, int16_t);
                 CASE(I32, int32_t);
                 CASE(I64, int64_t);
@@ -181,9 +183,9 @@ public:
         static std::unordered_map<std::string, ePrecision> names = {
 #define PRECISION_NAME(s) {#s, s}
             PRECISION_NAME(Q78),  PRECISION_NAME(U8),    PRECISION_NAME(I8),    PRECISION_NAME(I16),
-            PRECISION_NAME(I32),  PRECISION_NAME(I64),   PRECISION_NAME(U64),   PRECISION_NAME(U16),
+            PRECISION_NAME(I32),  PRECISION_NAME(I64),   PRECISION_NAME(U64),    PRECISION_NAME(U16),
             PRECISION_NAME(FP32), PRECISION_NAME(FP16),  PRECISION_NAME(MIXED), PRECISION_NAME(BIN),
-            PRECISION_NAME(BOOL),
+            PRECISION_NAME(BOOL), PRECISION_NAME(BF16),
 #undef PRECISION_NAME
         };
         auto i = names.find(str);
@@ -260,6 +262,7 @@ protected:
         switch (v) {
             CASE(FP32);
             CASE(FP16);
+            CASE(BF16);
             CASE(I16);
             CASE(I32);
             CASE(I64);
@@ -295,6 +298,10 @@ struct PrecisionTrait<Precision::FP16> {
     using value_type = int16_t;
 };
 template <>
+struct PrecisionTrait<Precision::BF16> {
+    using value_type = int16_t;
+};
+template<>
 struct PrecisionTrait<Precision::Q78> {
     using value_type = uint16_t;
 };
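For reference, a small hedged check of the new precision through the existing `Precision` helpers:

```cpp
#include <iostream>

#include <ie_precision.hpp>

void bf16_precision_info() {
    InferenceEngine::Precision p(InferenceEngine::Precision::BF16);
    // Both FP16 and BF16 occupy two bytes; they differ only in the exponent/mantissa split.
    std::cout << p.name() << " occupies " << p.size() << " bytes" << std::endl;
    std::cout << InferenceEngine::Precision::FromStr("BF16").name() << std::endl;  // round-trip via the name table
}
```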
index ecc5ee6..7f675a7 100644 (file)
@@ -92,14 +92,17 @@ Options:
                               Please note that although the automatic selection usually provides a reasonable performance, 
                               it still may be non-optimal for some cases, especially for very small networks.
     -nthreads "<integer>"     Optional. Number of threads to use for inference on the CPU (including HETERO and MULTI cases).
-    -pin "YES"/"NUMA"/"NO"    Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") 
-                              CPU threads pinning for CPU-involved inference.
+    -enforcebf16              Optional. Enforce execution of floating point operations in bfloat16 precision where it is acceptable.
+    -pin "YES"/"NO"/"NUMA"    Optional. Enable threads->cores ("YES", default), threads->(NUMA)nodes ("NUMA") or completely disable ("NO") CPU threads pinning for CPU-involved inference.
+
 
   Statistics dumping options:
     -report_type "<type>"     Optional. Enable collecting statistics report. "no_counters" report contains configuration options specified, resulting FPS and latency. "average_counters" report extends "no_counters" report and additionally includes average PM counters values for each layer from the network. "detailed_counters" report extends "average_counters" report and additionally includes per-layer PM counters and latency for each executed infer request.
     -report_folder            Optional. Path to a folder where statistics report is stored.
     -exec_graph_path          Optional. Path to a file where to store executable graph information serialized.
     -pc                       Optional. Report performance counters.
+    -dump_config              Optional. Path to XML/YAML/JSON file to dump IE parameters set by the application.
+    -load_config              Optional. Path to XML/YAML/JSON file to load custom IE parameters. Please note, command line parameters have higher priority than parameters from the configuration file.
 ```
 
 Running the application with the empty list of options yields the usage message given above and an error message.
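For example (the model path and configuration file name below are illustrative), the new bfloat16 and configuration-file options can be combined in a single run:

```sh
./benchmark_app -m <path_to_model>/model.xml -d CPU -enforcebf16 -load_config bench_config.json -report_type no_counters
```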
index 6b01db6..cefff68 100644 (file)
@@ -48,6 +48,9 @@ static const char infer_num_streams_message[] = "Optional. Number of streams to
                                                 "usually provides a reasonable performance, it still may be non - optimal for some cases, especially for "
                                                 "very small networks. See sample's README for more details.";
 
+/// @brief message for enforcing BF16 execution where it is possible
+static const char enforce_bf16_message[] = "Optional. Enforce execution of floating point operations in bfloat16 precision where it is acceptable.";
+
 /// @brief message for user library argument
 static const char custom_cpu_library_message[] = "Required for CPU custom layers. Absolute path to a shared library with the kernels implementations.";
 
@@ -85,6 +88,15 @@ static const char progress_message[] = "Optional. Show progress bar (can affect
 // @brief message for performance counters option
 static const char pc_message[] = "Optional. Report performance counters.";
 
+#ifdef USE_OPENCV
+// @brief message for load config option
+static const char load_config_message[] = "Optional. Path to XML/YAML/JSON file to load custom IE parameters."
+                                          " Please note, command line parameters have higher priority than parameters from the configuration file.";
+
+// @brief message for dump config option
+static const char dump_config_message[] = "Optional. Path to XML/YAML/JSON file to dump IE parameters set by the application.";
+#endif
+
 /// @brief Define flag for showing help message <br>
 DEFINE_bool(h, false, help_message);
 
@@ -130,6 +142,9 @@ DEFINE_uint32(nthreads, 0, infer_num_threads_message);
 /// @brief Number of streams to use for inference on the CPU (also affects Hetero cases)
 DEFINE_string(nstreams, "", infer_num_streams_message);
 
+/// @brief Enforces execution in bfloat16 precision on systems having this capability
+DEFINE_bool(enforcebf16, false, enforce_bf16_message);
+
 /// @brief Define parameter for batch size <br>
 /// Default is 0 (that means don't specify)
 DEFINE_uint32(b, 0, batch_size_message);
@@ -155,6 +170,14 @@ DEFINE_bool(progress, false, progress_message);
 /// @brief Define flag for showing performance counters <br>
 DEFINE_bool(pc, false, pc_message);
 
+#ifdef USE_OPENCV
+/// @brief Define flag for loading configuration file <br>
+DEFINE_string(load_config, "", load_config_message);
+
+/// @brief Define flag for dumping configuration file <br>
+DEFINE_string(dump_config, "", dump_config_message);
+#endif
+
 /**
 * @brief This function show a help message
 */
@@ -180,10 +203,15 @@ static void showUsage() {
     std::cout << std::endl << "  device-specific performance options:" << std::endl;
     std::cout << "    -nstreams \"<integer>\"     " << infer_num_streams_message << std::endl;
     std::cout << "    -nthreads \"<integer>\"     " << infer_num_threads_message << std::endl;
-    std::cout << "    -pin \"YES\"/\"NO\"           " << infer_threads_pinning_message << std::endl;
+    std::cout << "    -enforcebf16              " << enforce_bf16_message << std::endl;
+    std::cout << "    -pin \"YES\"/\"NO\"/\"NUMA\"    " << infer_threads_pinning_message << std::endl;
     std::cout << std::endl << "  Statistics dumping options:" << std::endl;
     std::cout << "    -report_type \"<type>\"     " << report_type_message << std::endl;
     std::cout << "    -report_folder            " << report_folder_message << std::endl;
     std::cout << "    -exec_graph_path          " << exec_graph_path_message << std::endl;
     std::cout << "    -pc                       " << pc_message << std::endl;
+#ifdef USE_OPENCV
+    std::cout << "    -dump_config              " << dump_config_message << std::endl;
+    std::cout << "    -load_config              " << load_config_message << std::endl;
+#endif
 }
index 34b80d0..be7325b 100644 (file)
@@ -55,9 +55,9 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
     }
 
     if (!FLAGS_report_type.empty() &&
-         FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport && FLAGS_report_type != detailedCntReport) {
+        FLAGS_report_type != noCntReport && FLAGS_report_type != averageCntReport && FLAGS_report_type != detailedCntReport) {
         std::string err = "only " + std::string(noCntReport) + "/" + std::string(averageCntReport) + "/" + std::string(detailedCntReport) +
-                " report types are supported (invalid -report_type option value)";
+                          " report types are supported (invalid -report_type option value)";
         throw std::logic_error(err);
     }
 
@@ -71,17 +71,17 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
 static void next_step(const std::string additional_info = "") {
     static size_t step_id = 0;
     static const std::map<size_t, std::string> step_names = {
-      { 1, "Parsing and validating input arguments" },
-      { 2, "Loading Inference Engine" },
-      { 3, "Setting device configuration" },
-      { 4, "Reading the Intermediate Representation network" },
-      { 5, "Resizing network to match image sizes and given batch" },
-      { 6, "Configuring input of the model" },
-      { 7, "Loading the model to the device" },
-      { 8, "Setting optimal runtime parameters" },
-      { 9, "Creating infer requests and filling input blobs with images" },
-      { 10, "Measuring performance" },
-      { 11, "Dumping statistics report" }
+            { 1, "Parsing and validating input arguments" },
+            { 2, "Loading Inference Engine" },
+            { 3, "Setting device configuration" },
+            { 4, "Reading the Intermediate Representation network" },
+            { 5, "Resizing network to match image sizes and given batch" },
+            { 6, "Configuring input of the model" },
+            { 7, "Loading the model to the device" },
+            { 8, "Setting optimal runtime parameters" },
+            { 9, "Creating infer requests and filling input blobs with images" },
+            { 10, "Measuring performance" },
+            { 11, "Dumping statistics report" }
     };
 
     step_id++;
@@ -121,38 +121,46 @@ int main(int argc, char *argv[]) {
             slog::info << "Network is compiled" << slog::endl;
         }
 
-        if (!FLAGS_report_type.empty()) {
-            std::vector<gflags::CommandLineFlagInfo> flags;
-            StatisticsReport::Parameters command_line_arguments;
-            gflags::GetAllFlags(&flags);
-
-            for (auto &flag : flags) {
-                if (!flag.is_default) {
-                    command_line_arguments.push_back({ flag.name, flag.current_value });
-                }
+        std::vector<gflags::CommandLineFlagInfo> flags;
+        StatisticsReport::Parameters command_line_arguments;
+        gflags::GetAllFlags(&flags);
+        for (auto &flag : flags) {
+            if (!flag.is_default) {
+                command_line_arguments.push_back({ flag.name, flag.current_value });
             }
+        }
+        if (!FLAGS_report_type.empty()) {
             statistics = std::make_shared<StatisticsReport>(StatisticsReport::Config{FLAGS_report_type, FLAGS_report_folder});
             statistics->addParameters(StatisticsReport::Category::COMMAND_LINE_PARAMETERS, command_line_arguments);
         }
+        auto isFlagSetInCommandLine = [&command_line_arguments] (const std::string& name) {
+           return (std::find_if(command_line_arguments.begin(), command_line_arguments.end(),
+           [ name ] (const std::pair<std::string, std::string>& p) { return p.first == name;}) != command_line_arguments.end());
+        };
+
+        std::string device_name = FLAGS_d;
+
+        // Parse devices
+        auto devices = parseDevices(device_name);
 
+        // Parse nstreams per device
+        std::map<std::string, std::string> device_nstreams = parseNStreamsValuePerDevice(devices, FLAGS_nstreams);
+
+        // Load device config file if specified
+        std::map<std::string, std::map<std::string, std::string>> config;
+#ifdef USE_OPENCV
+        if (!FLAGS_load_config.empty()) {
+            load_config(FLAGS_load_config, config);
+        }
+#endif
         /** This vector stores paths to the processed images **/
         std::vector<std::string> inputFiles;
         parseInputFilesArguments(inputFiles);
 
-        if (FLAGS_nstreams.empty()) {
-            slog::warn << "-nstreams default value is determined automatically for a device. "
-                "Although the automatic selection usually provides a reasonable performance,"
-                "but it still may be non-optimal for some cases, for more information look at README." << slog::endl<< slog::endl;
-        }
-
         // ----------------- 2. Loading the Inference Engine -----------------------------------------------------------
         next_step();
 
-        // Get optimal runtime parameters for device
-        std::string device_name = FLAGS_d;
-
         Core ie;
-
         if (FLAGS_d.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
             // CPU (MKLDNN) extensions is loaded as a shared library and passed as a pointer to base extension
             const auto extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
@@ -160,10 +168,17 @@ int main(int argc, char *argv[]) {
             slog::info << "CPU (MKLDNN) extensions is loaded " << FLAGS_l << slog::endl;
         }
 
+        // Load clDNN Extensions
         if ((FLAGS_d.find("GPU") != std::string::npos) && !FLAGS_c.empty()) {
-            // Load clDNN Extensions
-            ie.SetConfig({ {CONFIG_KEY(CONFIG_FILE), FLAGS_c} });
-            slog::info << "GPU extensions is loaded " << FLAGS_c << slog::endl;
+            // Override config if command line parameter is specified
+            if (!config.count("GPU"))
+                config["GPU"] = {};
+            config["GPU"][CONFIG_KEY(CONFIG_FILE)] = FLAGS_c;
+        }
+        if (config.count("GPU") && config.at("GPU").count(CONFIG_KEY(CONFIG_FILE))) {
+            auto ext = config.at("GPU").at(CONFIG_KEY(CONFIG_FILE));
+            ie.SetConfig({{ CONFIG_KEY(CONFIG_FILE), ext }}, "GPU");
+            slog::info << "GPU extensions is loaded " << ext << slog::endl;
         }
 
         slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl;
@@ -173,70 +188,108 @@ int main(int argc, char *argv[]) {
         // ----------------- 3. Setting device configuration -----------------------------------------------------------
         next_step();
 
-        bool perf_counts = (FLAGS_report_type == detailedCntReport ||
-                            FLAGS_report_type == averageCntReport ||
-                            FLAGS_pc ||
-                            !FLAGS_exec_graph_path.empty());
-
-        auto devices = parseDevices(device_name);
-        std::map<std::string, uint32_t> device_nstreams = parseNStreamsValuePerDevice(devices, FLAGS_nstreams);
-        for (auto& pair : device_nstreams) {
-            auto key = std::string(pair.first + "_THROUGHPUT_STREAMS");
-            std::vector<std::string> supported_config_keys = ie.GetMetric(pair.first, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
-            if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
-                 throw std::logic_error("Device " + pair.first + " doesn't support config key '" + key + "'! " +
-                                        "Please specify -nstreams for correct devices in format  <dev1>:<nstreams1>,<dev2>:<nstreams2>");
+        bool perf_counts = false;
+        // Update config per device according to command line parameters
+        for (auto& device : devices) {
+            if (!config.count(device)) config[device] = {};
+            std::map<std::string, std::string>& device_config = config.at(device);
+
+            // Set performance counter
+            if (isFlagSetInCommandLine("pc")) {
+                // set to user defined value
+                device_config[CONFIG_KEY(PERF_COUNT)] = FLAGS_pc ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
+            } else if (device_config.count(CONFIG_KEY(PERF_COUNT)) &&
+                      (device_config.at(CONFIG_KEY(PERF_COUNT)) == "YES")) {
+                slog::warn << "Performance counters for " << device <<
+                              " device are turned on. To print results use -pc option." << slog::endl;
+            } else if (FLAGS_report_type == detailedCntReport || FLAGS_report_type == averageCntReport) {
+                slog::warn << "Turn on performance counters for " << device <<
+                              " device since report type is " << FLAGS_report_type << "." << slog::endl;
+                device_config[CONFIG_KEY(PERF_COUNT)] = CONFIG_VALUE(YES);
+            } else if (!FLAGS_exec_graph_path.empty()) {
+                slog::warn << "Turn on performance counters for " << device <<
+                              " device due to execution graph dumping." << slog::endl;
+                device_config[CONFIG_KEY(PERF_COUNT)] = CONFIG_VALUE(YES);
+            } else {
+                // set to default value
+                device_config[CONFIG_KEY(PERF_COUNT)] = FLAGS_pc ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
             }
-        }
+            perf_counts = (device_config.at(CONFIG_KEY(PERF_COUNT)) == CONFIG_VALUE(YES)) ? true : perf_counts;
+
+            auto setThroughputStreams = [&] () {
+                const std::string key = device + "_THROUGHPUT_STREAMS";
+                if (device_nstreams.count(device)) {
+                    // set to user defined value
+                    std::vector<std::string> supported_config_keys = ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
+                    if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) {
+                        throw std::logic_error("Device " + device + " doesn't support config key '" + key + "'! " +
+                                               "Please specify -nstreams for correct devices in format  <dev1>:<nstreams1>,<dev2>:<nstreams2>" +
+                                               " or via configuration file.");
+                    }
+                    device_config[key] = device_nstreams.at(device);
+                } else if (!device_config.count(key) && (FLAGS_api == "async")) {
+                    slog::warn << "-nstreams default value is determined automatically for " << device << " device. "
+                          "Although the automatic selection usually provides a reasonable performance, "
+                          "it still may be non-optimal for some cases; for more information look at the README." << slog::endl;
+                    device_config[key] = std::string(device + "_THROUGHPUT_AUTO");
+                }
+                if (device_config.count(key))
+                    device_nstreams[device] = device_config.at(key);
+            };
 
-        for (auto& device : devices) {
             if (device == "CPU") {  // CPU supports few special performance-oriented keys
                 // limit threading for CPU portion of inference
-                if (FLAGS_nthreads != 0)
-                    ie.SetConfig({{ CONFIG_KEY(CPU_THREADS_NUM), std::to_string(FLAGS_nthreads) }}, device);
-
-                if ((device_name.find("MULTI") != std::string::npos) &&
-                    (device_name.find("GPU") != std::string::npos)) {
-                    ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), CONFIG_VALUE(NO) }}, device);
-                } else {
-                    // pin threads for CPU portion of inference
-                    ie.SetConfig({{ CONFIG_KEY(CPU_BIND_THREAD), FLAGS_pin }}, device);
+                if (isFlagSetInCommandLine("nthreads"))
+                    device_config[CONFIG_KEY(CPU_THREADS_NUM)] = std::to_string(FLAGS_nthreads);
+
+                if (isFlagSetInCommandLine("enforcebf16"))
+                    device_config[CONFIG_KEY(ENFORCE_BF16)] = FLAGS_enforcebf16 ? CONFIG_VALUE(YES) : CONFIG_VALUE(NO);
+
+                if (isFlagSetInCommandLine("pin")) {
+                    // set to user defined value
+                    device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin;
+                } else if (!device_config.count(CONFIG_KEY(CPU_BIND_THREAD))) {
+                    if ((device_name.find("MULTI") != std::string::npos) &&
+                        (device_name.find("GPU") != std::string::npos)) {
+                         slog::warn << "Turn off threads pinning for " << device <<
+                                       " device since multi-scenario with GPU device is used." << slog::endl;
+                        device_config[CONFIG_KEY(CPU_BIND_THREAD)] = CONFIG_VALUE(NO);
+                    } else {
+                        // set to default value
+                        device_config[CONFIG_KEY(CPU_BIND_THREAD)] = FLAGS_pin;
+                    }
                 }
 
                 // for CPU execution, more throughput-oriented execution via streams
-                if (FLAGS_api == "async")
-                    ie.SetConfig({{ CONFIG_KEY(CPU_THROUGHPUT_STREAMS),
-                                    (device_nstreams.count(device) > 0 ? std::to_string(device_nstreams.at(device)) :
-                                                                         "CPU_THROUGHPUT_AUTO") }}, device);
-                device_nstreams[device] = std::stoi(ie.GetConfig(device, CONFIG_KEY(CPU_THROUGHPUT_STREAMS)).as<std::string>());
+                setThroughputStreams();
             } else if (device == ("GPU")) {
-                if (FLAGS_api == "async")
-                    ie.SetConfig({{ CONFIG_KEY(GPU_THROUGHPUT_STREAMS),
-                                    (device_nstreams.count(device) > 0 ? std::to_string(device_nstreams.at(device)) :
-                                                                         "GPU_THROUGHPUT_AUTO") }}, device);
-                device_nstreams[device] = std::stoi(ie.GetConfig(device, CONFIG_KEY(GPU_THROUGHPUT_STREAMS)).as<std::string>());
+                // for GPU execution, more throughput-oriented execution via streams
+                setThroughputStreams();
 
                 if ((device_name.find("MULTI") != std::string::npos) &&
                     (device_name.find("CPU") != std::string::npos)) {
-                    // multi-device execution with the CPU + GPU performs best with GPU trottling hint,
-                    // which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
-                    ie.SetConfig({{ CLDNN_CONFIG_KEY(PLUGIN_THROTTLE), "1" }}, "GPU");
+                    slog::warn << "Turn on GPU throttling. Multi-device execution with the CPU + GPU performs best with GPU throttling hint, " <<
+                                  "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)" << slog::endl;
+                    device_config[CLDNN_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
                 }
             } else if (device == "MYRIAD") {
-                ie.SetConfig({{ CONFIG_KEY(LOG_LEVEL), CONFIG_VALUE(LOG_WARNING) }}, device);
+                device_config[CONFIG_KEY(LOG_LEVEL)] = CONFIG_VALUE(LOG_WARNING);
             }
         }
 
+        for (auto&& item : config) {
+            ie.SetConfig(item.second, item.first);
+        }
+
         auto double_to_string = [] (const double number) {
-                    std::stringstream ss;
-                    ss << std::fixed << std::setprecision(2) << number;
-                    return ss.str();
-                };
+            std::stringstream ss;
+            ss << std::fixed << std::setprecision(2) << number;
+            return ss.str();
+        };
         auto get_total_ms_time = [] (Time::time_point& startTime) {
             return std::chrono::duration_cast<ns>(Time::now() - startTime).count() * 0.000001;
         };
 
-
         size_t batchSize = FLAGS_b;
         Precision precision = Precision::UNSPECIFIED;
         std::string topology_name = "";
@@ -253,7 +306,7 @@ int main(int argc, char *argv[]) {
             if (statistics)
                 statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                           {
-                                              {"read network time (ms)", duration_ms}
+                                                  {"read network time (ms)", duration_ms}
                                           });
 
             const InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
@@ -305,17 +358,14 @@ int main(int argc, char *argv[]) {
             }
             // ----------------- 7. Loading the model to the device --------------------------------------------------------
             next_step();
-
-            std::map<std::string, std::string> config = {{ CONFIG_KEY(PERF_COUNT), perf_counts ? CONFIG_VALUE(YES) :
-                                                                                                CONFIG_VALUE(NO) }};
             startTime = Time::now();
-            exeNetwork = ie.LoadNetwork(cnnNetwork, device_name, config);
+            exeNetwork = ie.LoadNetwork(cnnNetwork, device_name);
             duration_ms = double_to_string(get_total_ms_time(startTime));
             slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
             if (statistics)
                 statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                           {
-                                              {"load network time (ms)", duration_ms}
+                                                  {"load network time (ms)", duration_ms}
                                           });
         } else {
             next_step();
@@ -333,7 +383,7 @@ int main(int argc, char *argv[]) {
             if (statistics)
                 statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                           {
-                                              {"import network time (ms)", duration_ms}
+                                                  {"import network time (ms)", duration_ms}
                                           });
             if (batchSize == 0) {
                 batchSize = 1;
@@ -342,6 +392,12 @@ int main(int argc, char *argv[]) {
         // ----------------- 8. Setting optimal runtime parameters -----------------------------------------------------
         next_step();
 
+        // Update number of streams
+        for (auto&& ds : device_nstreams) {
+            const std::string key = ds.first + "_THROUGHPUT_STREAMS";
+            device_nstreams[ds.first] = ie.GetConfig(ds.first, key).as<std::string>();
+        }
+
         // Number of requests
         uint32_t nireq = FLAGS_nireq;
         if (nireq == 0) {
@@ -384,21 +440,21 @@ int main(int argc, char *argv[]) {
         if (statistics) {
             statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
                                       {
-                                            {"topology", topology_name},
-                                            {"target device", device_name},
-                                            {"API", FLAGS_api},
-                                            {"precision", std::string(precision.name())},
-                                            {"batch size", std::to_string(batchSize)},
-                                            {"number of iterations", std::to_string(niter)},
-                                            {"number of parallel infer requests", std::to_string(nireq)},
-                                            {"duration (ms)", std::to_string(getDurationInMilliseconds(duration_seconds))},
+                                              {"topology", topology_name},
+                                              {"target device", device_name},
+                                              {"API", FLAGS_api},
+                                              {"precision", std::string(precision.name())},
+                                              {"batch size", std::to_string(batchSize)},
+                                              {"number of iterations", std::to_string(niter)},
+                                              {"number of parallel infer requests", std::to_string(nireq)},
+                                              {"duration (ms)", std::to_string(getDurationInMilliseconds(duration_seconds))},
                                       });
             for (auto& nstreams : device_nstreams) {
                 std::stringstream ss;
                 ss << "number of " << nstreams.first << " streams";
                 statistics->addParameters(StatisticsReport::Category::RUNTIME_CONFIG,
                                           {
-                                                {ss.str(), std::to_string(nstreams.second)},
+                                                  {ss.str(), nstreams.second},
                                           });
             }
         }
@@ -511,23 +567,23 @@ int main(int argc, char *argv[]) {
         double latency = getMedianValue<double>(inferRequestsQueue.getLatencies());
         double totalDuration = inferRequestsQueue.getDurationInMilliseconds();
         double fps = (FLAGS_api == "sync") ? batchSize * 1000.0 / latency :
-                                             batchSize * 1000.0 * iteration / totalDuration;
+                     batchSize * 1000.0 * iteration / totalDuration;
 
         if (statistics) {
             statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                       {
-                                        {"total execution time (ms)", double_to_string(totalDuration)},
-                                        {"total number of iterations", std::to_string(iteration)},
+                                              {"total execution time (ms)", double_to_string(totalDuration)},
+                                              {"total number of iterations", std::to_string(iteration)},
                                       });
             if (device_name.find("MULTI") == std::string::npos) {
                 statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                           {
-                                            {"latency (ms)", double_to_string(latency)},
+                                                  {"latency (ms)", double_to_string(latency)},
                                           });
             }
             statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                       {
-                                          {"throughput", double_to_string(fps)}
+                                              {"throughput", double_to_string(fps)}
                                       });
         }
 
@@ -536,6 +592,13 @@ int main(int argc, char *argv[]) {
         // ----------------- 11. Dumping statistics report -------------------------------------------------------------
         next_step();
 
+#ifdef USE_OPENCV
+        if (!FLAGS_dump_config.empty()) {
+            dump_config(FLAGS_dump_config, config);
+            slog::info << "Inference Engine configuration settings were dumped to " << FLAGS_dump_config << slog::endl;
+        }
+#endif
+
         if (!FLAGS_exec_graph_path.empty()) {
             try {
                 CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo();
@@ -575,7 +638,7 @@ int main(int argc, char *argv[]) {
         if (statistics) {
             statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS,
                                       {
-                                            {"error", ex.what()},
+                                              {"error", ex.what()},
                                       });
             statistics->dump();
         }
index a7c05d7..d0820bc 100644 (file)
 
 #include "utils.hpp"
 
+#ifdef USE_OPENCV
+#include <opencv2/core.hpp>
+#endif
+
 uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device) {
     static const std::map<std::string, uint32_t> deviceDefaultDurationInSeconds {
             { "CPU",     60  },
@@ -60,32 +64,33 @@ std::vector<std::string> parseDevices(const std::string& device_string) {
     if (comma_separated_devices.find(":") != std::string::npos) {
         comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1);
     }
+    if ((comma_separated_devices == "MULTI") || (comma_separated_devices == "HETERO"))
+        return std::vector<std::string>();
     auto devices = split(comma_separated_devices, ',');
     for (auto& device : devices)
         device = device.substr(0, device.find_first_of(".("));
     return devices;
 }
 
-std::map<std::string, uint32_t> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
-                                                            const std::string& values_string) {
+std::map<std::string, std::string> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+                                                               const std::string& values_string) {
     //  Format: <device1>:<value1>,<device2>:<value2> or just <value>
-    auto values_string_upper = values_string;
-    std::map<std::string, uint32_t> result;
-    auto device_value_strings = split(values_string_upper, ',');
+    std::map<std::string, std::string> result;
+    auto device_value_strings = split(values_string, ',');
     for (auto& device_value_string : device_value_strings) {
-        auto device_value_vec =  split(device_value_string, ':');
+        auto device_value_vec = split(device_value_string, ':');
         if (device_value_vec.size() == 2) {
             auto device_name = device_value_vec.at(0);
             auto nstreams = device_value_vec.at(1);
             auto it = std::find(devices.begin(), devices.end(), device_name);
             if (it != devices.end()) {
-                result[device_name] = std::stoi(nstreams);
+                result[device_name] = nstreams;
             } else {
                 throw std::logic_error("Can't set nstreams value " + std::string(nstreams) +
                                        " for device '" + device_name + "'! Incorrect device name!");
             }
         } else if (device_value_vec.size() == 1) {
-            uint32_t value = std::stoi(device_value_vec.at(0));
+            auto value = device_value_vec.at(0);
             for (auto& device : devices) {
                 result[device] = value;
             }
@@ -95,3 +100,37 @@ std::map<std::string, uint32_t> parseNStreamsValuePerDevice(const std::vector<st
     }
     return result;
 }
+
+#ifdef USE_OPENCV
+void dump_config(const std::string& filename,
+                 const std::map<std::string, std::map<std::string, std::string>>& config) {
+    cv::FileStorage fs(filename, cv::FileStorage::WRITE);
+    if (!fs.isOpened())
+        throw std::runtime_error("Error: Can't open config file : " + filename);
+    for (auto device_it = config.begin(); device_it != config.end(); ++device_it) {
+        fs << device_it->first  << "{:";
+        for (auto param_it = device_it->second.begin(); param_it != device_it->second.end(); ++param_it)
+            fs << param_it->first << param_it->second;
+        fs << "}";
+    }
+    fs.release();
+}
+
+void load_config(const std::string& filename,
+                 std::map<std::string, std::map<std::string, std::string>>& config) {
+    cv::FileStorage fs(filename, cv::FileStorage::READ);
+    if (!fs.isOpened())
+        throw std::runtime_error("Error: Can't load config file : " + filename);
+    cv::FileNode root = fs.root();
+    for (auto it = root.begin(); it != root.end(); ++it) {
+        auto device = *it;
+        if (!device.isMap()) {
+            throw std::runtime_error("Error: Can't parse config file : " + filename);
+        }
+        for (auto iit = device.begin(); iit != device.end(); ++iit) {
+            auto item = *iit;
+            config[device.name()][item.name()] = item.string();
+        }
+    }
+}
+#endif
\ No newline at end of file
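A hedged sketch of how these helpers are meant to be driven from the sample (file names are illustrative; the on-disk layout is whatever `cv::FileStorage` writes for a map of per-device key/value maps, so XML, YAML, and JSON all work):

```cpp
#ifdef USE_OPENCV
#include <map>
#include <string>

#include "utils.hpp"

void config_roundtrip_example() {
    std::map<std::string, std::map<std::string, std::string>> config;
    load_config("bench_config.json", config);       // read user-provided IE parameters
    config["CPU"]["PERF_COUNT"] = "YES";             // command-line style overrides win over the file
    dump_config("bench_config_dump.json", config);   // persist what was actually applied
}
#endif
```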
index b7ae051..70c2ce7 100644 (file)
 
 std::vector<std::string> parseDevices(const std::string& device_string);
 uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device);
-std::map<std::string, uint32_t> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
-                                                            const std::string& values_string);
+std::map<std::string, std::string> parseNStreamsValuePerDevice(const std::vector<std::string>& devices,
+                                                               const std::string& values_string);
+#ifdef USE_OPENCV
+void dump_config(const std::string& filename,
+                 const std::map<std::string, std::map<std::string, std::string>>& config);
+void load_config(const std::string& filename,
+                 std::map<std::string, std::map<std::string, std::string>>& config);
+#endif
\ No newline at end of file
index 79779cb..e656dcc 100644 (file)
@@ -85,30 +85,28 @@ int main(int argc, char *argv[]) {
         std::vector<std::string> availableDevices = ie.GetAvailableDevices();
 
         // --------------------------- 3. Query and print supported metrics and config keys--------------------
-        std::set<std::string> printedDevices;
 
         std::cout << "Available devices: " << std::endl;
         for (auto && device : availableDevices) {
-            std::string deviceFamilyName = device.substr(0, device.find_first_of('.'));
-            if (printedDevices.find(deviceFamilyName) == printedDevices.end())
-                printedDevices.insert(deviceFamilyName);
-            else
-                continue;
-
-            std::cout << "\tDevice: " << deviceFamilyName << std::endl;
+            std::cout << "\tDevice: " << device << std::endl;
 
             std::cout << "\tMetrics: " << std::endl;
-            std::vector<std::string> supportedMetrics = ie.GetMetric(deviceFamilyName, METRIC_KEY(SUPPORTED_METRICS));
+            std::vector<std::string> supportedMetrics = ie.GetMetric(device, METRIC_KEY(SUPPORTED_METRICS));
             for (auto && metricName : supportedMetrics) {
-                std::cout << "\t\t" << metricName << " : " << std::flush;
-                printParameterValue(ie.GetMetric(device, metricName));
+                if (metricName != METRIC_KEY(AVAILABLE_DEVICES)) {
+                    std::cout << "\t\t" << metricName << " : " << std::flush;
+                    printParameterValue(ie.GetMetric(device, metricName));
+                }
             }
 
-            std::cout << "\tDefault values for device configuration keys: " << std::endl;
-            std::vector<std::string> supportedConfigKeys = ie.GetMetric(deviceFamilyName, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
-            for (auto && configKey : supportedConfigKeys) {
-                std::cout << "\t\t" << configKey << " : " << std::flush;
-                printParameterValue(ie.GetConfig(deviceFamilyName, configKey));
+            if (std::find(supportedMetrics.begin(), supportedMetrics.end(),
+                METRIC_KEY(SUPPORTED_CONFIG_KEYS)) != supportedMetrics.end()) {
+                std::cout << "\tDefault values for device configuration keys: " << std::endl;
+                std::vector<std::string> supportedConfigKeys = ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS));
+                for (auto && configKey : supportedConfigKeys) {
+                    std::cout << "\t\t" << configKey << " : " << std::flush;
+                    printParameterValue(ie.GetConfig(device, configKey));
+                }
             }
 
             std::cout << std::endl;
index 3030771..0a594eb 100644 (file)
@@ -643,12 +643,13 @@ int main(int argc, char *argv[]) {
         auto t0 = Time::now();
         ExecutableNetwork executableNet;
 
+        ie.SetConfig(genericPluginConfig, deviceStr);
         if (!FLAGS_m.empty()) {
             slog::info << "Loading model to the device" << slog::endl;
-            executableNet = ie.LoadNetwork(network, deviceStr, genericPluginConfig);
+            executableNet = ie.LoadNetwork(network, deviceStr);
         } else {
             slog::info << "Importing model to the device" << slog::endl;
-            executableNet = ie.ImportNetwork(FLAGS_rg.c_str(), deviceStr, genericPluginConfig);
+            executableNet = ie.ImportNetwork(FLAGS_rg.c_str(), deviceStr);
         }
 
         ms loadTime = std::chrono::duration_cast<ms>(Time::now() - t0);
diff --git a/inference-engine/scripts/run_code_checks.sh b/inference-engine/scripts/run_code_checks.sh
new file mode 100644 (file)
index 0000000..11689b7
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+CURRENT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+command -v realpath >/dev/null 2>&1 || { echo >&2 "cpplint requires the realpath executable but it's not installed. Aborting."; exit 1; }
+SOURCE_DIR=$(realpath ${CURRENT_DIR}/..)
+REPORT_DIR="${SOURCE_DIR}/report"
+CPPLINT_REPORT_DIR="${REPORT_DIR}/cpplint"
+PROJECT_NAME="Inference Engine"
+
+function run_cpplint() {
+    echo "-> CppLint started..."
+    if [ -d ${CPPLINT_REPORT_DIR} ]; then
+        rm -Rf ${CPPLINT_REPORT_DIR}
+    fi
+
+    mkdir -p ${CPPLINT_REPORT_DIR}
+    python ${CURRENT_DIR}/cpplint.py --linelength=160 --counting=detailed --quiet --filter="
+        -build/header_guard,
+        -build/include,
+        -build/include_order,
+        -build/include_subdir,
+        -build/include_what_you_use,
+        -build/namespaces,
+        -build/c++11,
+        -whitespace/indent,
+        -whitespace/comments,
+        -whitespace/ending_newline,
+        -runtime/references,
+        -runtime/int,
+        -runtime/explicit,
+        -readability/todo,
+        -readability/fn_size
+    " $(find ${SOURCE_DIR} -name '*.h' -or -name '*.cc' -or -name '*.c' -or -name '*.cpp' -or -name '*.hpp' |
+        grep -v 'inference-engine/bin\|inference-engine/build\|inference-engine/report\|inference-engine/scripts\|inference-engine/temp\|inference-engine/tests_deprecated/\|gtest\|inference-engine/ie_bridges\|pugixml\|inference-engine/tools/vpu_perfcheck\|thirdparty/gflags\|thirdparty/ade\|thirdparty/fluid\|thirdparty/mkl-dnn\|thirdparty/movidius\|thirdparty/ocv\|thirdparty/plugixml\|thirdparty/std_lib\|thirdparty/clDNN/common\|thirdparty/clDNN/tutorial\|thirdparty/clDNN/utils' |
+        grep 'include\|src\|inference-engine/samples\|thirdparty/clDNN/kernel_selector\|thirdparty/clDNN/api\|thirdparty/clDNN/api_extension\|inference-engine/tests_' ) 2>&1 |
+        sed 's/"/\&quot;/g' >&1| sed 's/</\&lt;/g' >&1| sed 's/>/\&gt;/g' >&1| sed "s/'/\&apos;/g" >&1|
+        sed 's/\&/\&amp;/g' >&1| python ${CURRENT_DIR}/cpplint_to_cppcheckxml.py &> ${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml
+
+       # Generate html from it
+       ${CURRENT_DIR}/cppcheck-htmlreport.py --file=${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml --report-dir=${CPPLINT_REPORT_DIR} --source-dir=${SOURCE_DIR} --title=${PROJECT_NAME}
+
+       # Change Cppcheck things to cpplint
+       sed -i.bak 's/Cppcheck/cpplint/g' ${CPPLINT_REPORT_DIR}/index.html
+       sed -i.bak 's/a\ tool\ for\ static\ C\/C++\ code\ analysis/an\ open\ source\ lint\-like\ tool\ from\ Google/g' ${CPPLINT_REPORT_DIR}/index.html
+       sed -i.bak 's/http:\/\/cppcheck.sourceforge.net/http:\/\/google\-styleguide.googlecode.com\/svn\/trunk\/cpplint\/cpplint.py/g' ${CPPLINT_REPORT_DIR}/index.html
+       sed -i.bak 's/IRC: <a href=\"irc:\/\/irc.freenode.net\/cppcheck\">irc:\/\/irc.freenode.net\/cppcheck<\/a>/\ /g' ${CPPLINT_REPORT_DIR}/index.html
+
+    echo "-> CppLint finished..."
+}
+
+function run_cpp_check() {
+    echo "-> Cppcheck started..."
+    CPPCHECK_REPORT_DIR="${REPORT_DIR}/cppcheck"
+    if [ -d ${CPPCHECK_REPORT_DIR} ]; then
+        rm -Rf ${CPPCHECK_REPORT_DIR}
+    fi
+
+    mkdir -p ${CPPCHECK_REPORT_DIR}
+
+       # Generate cppcheck xml
+       cppcheck -v --enable=all --suppress=missingIncludeSystem --std=c++11 ${SOURCE_DIR} -i${SOURCE_DIR}/thirdparty -i${SOURCE_DIR}/tests/libs -i${SOURCE_DIR}/temp -i${SOURCE_DIR}/build \
+         -i${SOURCE_DIR}/bin -i${SOURCE_DIR}/report -I${SOURCE_DIR}/include -I${SOURCE_DIR}/src -I${SOURCE_DIR}/thirdparty/pugixml/src -I${SOURCE_DIR}/thirdparty/gflags/src -I${SOURCE_DIR}/samples/scoring_agent/HTTPClient -I${SOURCE_DIR}/src/inference_engine --xml-version=2 2> ${CPPCHECK_REPORT_DIR}/cppcheck-only-result.xml
+
+       # Generate html from it
+       python ${CURRENT_DIR}/cppcheck-htmlreport.py\
+               --file=${CPPCHECK_REPORT_DIR}/cppcheck-only-result.xml\
+               --report-dir=${CPPCHECK_REPORT_DIR}\
+               --source-dir=${SOURCE_DIR}\
+               --title=${PROJECT_NAME}
+    echo "-> Cppcheck finished..."
+}
+
+if [ ! -d ${REPORT_DIR} ]; then
+    mkdir -p ${REPORT_DIR}
+fi
+
+run_cpplint
+
+out_cpp_lint=`cat ${CPPLINT_REPORT_DIR}/cpplint-cppcheck-result.xml`
+if [[ ${out_cpp_lint} == *"error"* ]]; then
+    exit 1
+fi
+#run_cpp_check
index c620939..5774f02 100644 (file)
@@ -4,6 +4,8 @@
 
 add_subdirectory(preprocessing)
 
+add_subdirectory(ir_readers)
+
 add_subdirectory(legacy_api)
 
 if(ENABLE_MKL_DNN)
index c63f204..25bd481 100644 (file)
@@ -269,5 +269,6 @@ void Config::adjustKeyMapValues() {
 
     key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
     key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
+    key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
 }
 }  // namespace CLDNNPlugin
index 6d6fd9e..c18dc0f 100644 (file)
@@ -2704,7 +2704,7 @@ void Program::CreatePoolingPrimitive(cldnn::topology& topology, InferenceEngine:
         } else {
             size = (cldnn::tensor) cldnn::spatial(TensorValue(poolLayer->_kernel[X_AXIS]), TensorValue(poolLayer->_kernel[Y_AXIS]));
             stride = (cldnn::tensor) cldnn::spatial(TensorValue(poolLayer->_stride[X_AXIS]), TensorValue(poolLayer->_stride[Y_AXIS]));
-            input_offset = { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]) };
+            input_offset = { 0, 0, -TensorValue(allPads.begin[X_AXIS]), -TensorValue(allPads.begin[Y_AXIS]), 0 };
         }
 
         auto dt = DataTypeFromPrecision(poolLayer->outData[0]->getPrecision());
index 1ab4e64..c3e5244 100644 (file)
@@ -8,6 +8,7 @@
 #include <set>
 #include <string>
 #include <algorithm>
+#include <map>
 
 #if defined __INTEL_COMPILER || defined _MSC_VER
 #include <malloc.h>
@@ -28,6 +29,7 @@
 #include "gna2_model_debug_log.hpp"
 #else
 #include <gna-api-types-xnn.h>
+#include <map>
 
 #endif
 
@@ -373,6 +375,13 @@ float GNAPluginNS::backend::AMIntelDNN::OutputScaleFactor(intel_dnn_component_t
     return comp.output_scale_factor;
 }
 
+struct InputEndPoint {
+    int idx = 0;
+    size_t size = 0;
+    size_t num_bytes_per_output = 1;
+    InputEndPoint() = default;
+    InputEndPoint(int nidx, size_t sz, size_t esize) : idx(nidx), size(sz), num_bytes_per_output(esize) {}
+};
 
 void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename) {
     auto & components = component;
@@ -414,11 +423,21 @@ void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename)
         return ptra >= ptrb  && ptra < reinterpret_cast<char*>(ptrb) + bsize;
     };
 
+    auto startPtr = [](void* ptr, size_t size) {
+        return reinterpret_cast<int8_t*>(ptr);
+    };
+    auto endPtr = [](void* ptr, size_t size) {
+        return reinterpret_cast<int8_t*>(ptr) + size;
+    };
+    auto sizeofTensor = [](void* ptr, size_t size) {
+        return size;
+    };
+
     std::fstream graph(filename, std::ios::out);
     graph << "strict digraph {";
     std::set<void*> weights;
     std::set<void*> biases;
-    std::set<void*> outputs;
+    std::map<void*, InputEndPoint> outputs;
     std::set<std::string> layersNames;
 
     auto generate_layer_name = [&](int k) {
@@ -565,11 +584,25 @@ void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename)
             }
         }
         if (!inputConnected) {
-            // drawing tmp connection
-            outputs.insert(components[k].ptr_inputs);
-            auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
-            graph << tidx << " -> " << l
-                  << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
+            // searching for TMP connection
+            size_t tidx = -1;
+            for (auto && en : outputs) {
+                if (intersected(en.first, en.second.size, INPUTS(k))) {
+                    tidx = en.second.idx;
+                    auto  updated_ptr  = std::min(startPtr(en.first, en.second.size), startPtr(INPUTS(k)));
+                    auto  updated_size = std::max(endPtr(en.first, en.second.size), endPtr(INPUTS(k))) - updated_ptr;
+                    outputs.erase(en.first);
+                    outputs[updated_ptr] = InputEndPoint(tidx, updated_size, components[k].num_bytes_per_input);
+                    break;
+                }
+            }
+
+            if (tidx == -1) {
+                outputs[components[k].ptr_inputs] = InputEndPoint(outputs.size(), sizeofTensor(INPUTS(k)), components[k].num_bytes_per_input);
+            }
+            tidx = outputs[components[k].ptr_inputs].idx;
+            graph << "parameter_" << tidx << " -> " << l
+                  << " [fontcolor=darkgreen,color=orange, style=dashed];";
         }
     }
 
@@ -578,13 +611,25 @@ void GNAPluginNS::backend::AMIntelDNN::WriteGraphWizModel(const char *filename)
 
         int tidx = 0;
         for (auto tmpOutPtrs : outputs) {
-            if (components[k].ptr_outputs == tmpOutPtrs) {
+            if (components[k].ptr_outputs == tmpOutPtrs.first) {
                 graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
             }
             tidx++;
         }
     }
 
+    // writing inputs info
+    for (auto && en : outputs) {
+        std::string l = "parameter_" + std::to_string(en.second.idx);
+        graph <<  l << " [shape=box, style=filled, fillcolor=\"#85C1E9\"";
+        graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
+                 "  <TR><TD  colspan=\"2\">" <<  l << "</TD></TR>\n";
+        graph << "  <TR><TD> dims</TD><TD>" << 1 << "x" << en.second.size / en.second.num_bytes_per_output << "</TD></TR>\n";
+        graph << "  <TR><TD> obit</TD><TD>" << en.second.num_bytes_per_output << "</TD></TR>\n";
+        graph << "  <TR><TD> ptr</TD><TD>" <<  en.first << "</TD></TR>\n";
+        graph << "</TABLE>>];\n";
+    }
+
     graph << "}";
 }
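
The change above replaces the flat std::set of temporary output pointers with a std::map<void*, InputEndPoint>, so byte ranges that overlap an already-recorded region are merged into a single GraphViz "parameter" node instead of producing duplicates. A minimal, self-contained sketch of that merge step, with illustrative names (Range, mergeRange) that do not exist in the plugin:

    #include <algorithm>
    #include <cstddef>
    #include <map>

    struct Range { int idx; std::size_t size; };         // stand-in for InputEndPoint

    // Merge [ptr, ptr + size) into the map: widen an overlapping entry and reuse
    // its index, or register a new entry with the next free index.
    int mergeRange(std::map<void*, Range>& ranges, void* ptr, std::size_t size) {
        auto* beg = static_cast<char*>(ptr);
        for (auto it = ranges.begin(); it != ranges.end(); ++it) {
            auto* rbeg = static_cast<char*>(it->first);
            auto* rend = rbeg + it->second.size;
            if (beg < rend && beg + size > rbeg) {        // intervals intersect
                int idx = it->second.idx;
                auto* nbeg = std::min(rbeg, beg);
                auto nsize = static_cast<std::size_t>(std::max(rend, beg + size) - nbeg);
                ranges.erase(it);
                ranges[nbeg] = Range{idx, nsize};         // widened range keeps the old index
                return idx;
            }
        }
        int idx = static_cast<int>(ranges.size());
        ranges[ptr] = Range{idx, size};                   // disjoint range, new index
        return idx;
    }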
 
index d58a96a..90f01ff 100644 (file)
@@ -21,20 +21,28 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe
     std::shared_ptr<GNAPlugin> plg;
 
  public:
-    GNAExecutableNetwork(const std::string &aotFileName, const std::map<std::string, std::string> &config) :
-        plg(std::make_shared<GNAPlugin>(config)) {
+    GNAExecutableNetwork(const std::string &aotFileName, std::shared_ptr<GNAPlugin> plg)
+        : plg(plg) {
         plg->ImportNetwork(aotFileName);
         _networkInputs  = plg->GetInputs();
         _networkOutputs = plg->GetOutputs();
     }
 
-    GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config)
-        : plg(std::make_shared<GNAPlugin>(config)) {
+    GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, std::shared_ptr<GNAPlugin> plg)
+        : plg(plg) {
         InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::I64, InferenceEngine::Precision::I32);
         InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::U64, InferenceEngine::Precision::I32);
         plg->LoadNetwork(network);
     }
 
+    GNAExecutableNetwork(const std::string &aotFileName, const std::map<std::string, std::string> &config)
+        : GNAExecutableNetwork(aotFileName, std::make_shared<GNAPlugin>(config)) {
+    }
+
+    GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map<std::string, std::string> &config)
+        : GNAExecutableNetwork(network, std::make_shared<GNAPlugin>(config)) {
+    }
+
     InferenceEngine::AsyncInferRequestInternal::Ptr
         CreateAsyncInferRequestImpl(InferenceEngine::InputsDataMap networkInputs,
                                     InferenceEngine::OutputsDataMap networkOutputs) override {
@@ -58,5 +66,18 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe
     void ExportImpl(std::ostream&) override {
         THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str;
     }
+
+    void GetConfig(const std::string &name,
+                   InferenceEngine::Parameter &result,
+                   InferenceEngine::ResponseDesc* /*resp*/) const override {
+        result = plg->GetConfig(name, {});
+    }
+
+    void GetMetric(const std::string& name,
+                   InferenceEngine::Parameter& result,
+                   InferenceEngine::ResponseDesc* /* resp */) const override {
+        result = plg->GetMetric(name, {});
+    }
 };
+
 }  // namespace GNAPluginNS
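
Because GetConfig and GetMetric are now forwarded to the owning GNAPlugin, an executable network can be inspected through the regular Inference Engine API. A hedged usage sketch; the model paths are placeholders and the key/value macros are assumed to come from gna_config.hpp and ie_plugin_config.hpp:

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>
    #include <gna/gna_config.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml", "model.bin");   // hypothetical IR
        auto exec = ie.LoadNetwork(network, "GNA",
            {{GNA_CONFIG_KEY(DEVICE_MODE), GNA_CONFIG_VALUE(SW_EXACT)}});

        // both calls are delegated to GNAPlugin::GetConfig / GetMetric as in the hunk above
        auto mode = exec.GetConfig(GNA_CONFIG_KEY(DEVICE_MODE)).as<std::string>();
        auto keys = exec.GetMetric(METRIC_KEY(SUPPORTED_CONFIG_KEYS)).as<std::vector<std::string>>();
        std::cout << mode << ", " << keys.size() << " supported config keys\n";
        return 0;
    }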
index a0a105e..a5b352c 100644 (file)
@@ -34,6 +34,7 @@
 #include "layers/gna_concat_layer.hpp"
 #include "layers/gna_crop_layer.hpp"
 #include "round_float_define.hpp"
+#include "gna_plugin_policy.hpp"
 
 using namespace InferenceEngine;
 using namespace std;
@@ -58,6 +59,10 @@ void GNAGraphCompiler::setGNAFlagsPtr(std::shared_ptr<GNAPluginNS::GNAFlags> gna
     this->gnaFlags = std::move(gnaFlagsPtr);
 }
 
+void GNAGraphCompiler::setPolicy(GNAPluginNS::Policy policyToSet) {
+    this->policy = policyToSet;
+}
+
 intel_dnn_component_t * GNAGraphCompiler::find_first_unused_input(InferenceEngine::CNNLayerPtr current) {
     if (current->insData.empty())
         return nullptr;
@@ -987,13 +992,57 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
     auto outputs = *layer->outData.begin();
     auto inputs = layer->insData.begin()->lock();
 
-    // auto offset = filterLayer->GetParamAsInt("output_offset");
-
     uint32_t num_columns_in = FROM_IR_DIM(inputs, 2);
     uint32_t num_rows_out = FROM_IR_DIM(outputs, 1);
     uint32_t num_rows_in = filterLayer->_weights->size() / num_rows_out;
     uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
 
+    auto numRowsPadded = filterLayer->GetParamAsInt("num_rows_padded");
+    // number of rows handled by inserting a copy layer
+    uint32_t num_rows_copied = 0;
+    // in case left alignment succeeded but the number of elements is not a multiple of 8, an align_filter still has to be inserted;
+    // this is improved by inserting a copy layer that covers most of the elements - at most a 32x31 affine filter remains
+    if (policy.ConcatAlignmentPolicy == Policy::ConcatAlignment::FAST &&  0 == numRowsPadded && ALIGN(num_rows_in, 32) > 32) {
+        // can we use copy at all
+        num_rows_copied = ALIGN(num_rows_in, 32) - 32;
+
+        auto orientation = kDnnInterleavedOrientation;
+
+        auto& copyComponent = dnnComponents.addComponent(layer->name + "_synthetic_copy", "copy");
+
+        dnn->InitCopyComponent(copyComponent,
+                               orientation,
+                               num_rows_copied,
+                               num_columns_in,
+                               num_rows_copied,
+                               num_columns_in,
+                               inputs->getPrecision().size(),
+                               inputs->getPrecision().size(),
+                               quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+                               num_rows_copied,
+                               num_columns_in,
+                               ptr_inputs,
+                               ptr_outputs);
+
+
+        size_t num_data_bytes_in = num_rows_copied * num_rows_copied * num_columns_in
+            * inputs->getPrecision().size();
+        // need to reserve the full tensor, so use the original size, assuming an identity activation is attached to the filter later on
+        size_t num_data_bytes_out = num_rows_out * num_columns_in * inputs->getPrecision().size();
+
+        connectInput(layer, ptr_inputs, num_data_bytes_in);
+        auto isNonFunctional = [](CNNLayerPtr l) {
+            return LayerInfo(l).isNonFunctional();
+        };
+        auto identity = CNNNetGetNextLayerSkipCertain(layer, 0, 0, isNonFunctional);
+        connectOutput(identity.first, ptr_outputs, num_data_bytes_out);
+
+        num_rows_in  -= num_rows_copied;
+        num_rows_out -= num_rows_copied;
+    }
+    filterLayer->params["rows_copied_offset"] = std::to_string(num_rows_copied * inputs->getPrecision().size());
+
+
     auto biasPrecision = filterLayer->_biases ? filterLayer->_biases->getTensorDesc().getPrecision() : outputs->getPrecision();
     auto& currentComponent = dnnComponents.addComponent(layer->name, "affine");
 
@@ -1013,35 +1062,36 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
         ptr_biases,
         false);
 
-    size_t num_data_bytes_out =
-        InferenceEngine::details::product(
-            begin(outputs->getDims()), end(outputs->getDims())) * 4;
-
+    size_t num_data_bytes_out = num_rows_out * num_columns_in * outputs->getPrecision().size();
     size_t num_data_bytes_in = num_columns_in *
         ALIGN(num_rows_in, 8) * inputs->getPrecision().size();
 
-    connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0);
+    connectInput(layer, ptr_inputs, num_data_bytes_in, num_rows_copied * inputs->getPrecision().size(), 0);
     connectOutput(layer, ptr_outputs, num_data_bytes_out);
 
-    if (num_padding == 0) {
-        gnamem->readonly().push_ptr(ptr_weights,
-            filterLayer->_weights->cbuffer().as<const void*>(),
-            filterLayer->_weights->byteSize(),
-            64);
-    } else {
+    {
+        auto weightsElementSize = filterLayer->_weights->getTensorDesc().getPrecision().size();
         auto elementsIn = (num_rows_in + num_padding) * num_columns_in;
         auto paddedWeights = elementsIn * num_rows_out;
-        auto paddedWeightsSize = paddedWeights * filterLayer->precision.size();
+        auto paddedWeightsSize = paddedWeights * weightsElementSize;
+
+        // TODO: this can be improved to not generate unneeded weights at all
+
+        size_t weights_stride =  (num_rows_in + num_rows_copied) * weightsElementSize;
+        size_t weights_offset = weights_stride * num_rows_copied +  num_rows_copied * weightsElementSize;
 
         gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void* data, size_t size) {
-            size_t offset = 0;
-            for (int i = 0; i < num_rows_out && size >= offset; i++) {
-                ie_memcpy(reinterpret_cast<uint8_t*>(data) + offset, size - offset,
-                    filterLayer->_weights->cbuffer().as<const uint8_t*>() + num_rows_in * i * filterLayer->precision.size(),
-                    num_rows_in* filterLayer->precision.size());
-                offset += (num_rows_in + num_padding) * filterLayer->precision.size();
+            size_t roffset = weights_offset;
+            size_t woffset = 0;
+            for (int i = 0; i < num_rows_out && size >= woffset; i++) {
+                ie_memcpy(reinterpret_cast<uint8_t*>(data) + woffset,
+                          size - woffset,
+                          filterLayer->_weights->cbuffer().as<const uint8_t*>() + roffset,
+                          num_rows_in * weightsElementSize);
+                roffset += weights_stride;
+                woffset += elementsIn * weightsElementSize;
             }
-            }, 64);
+         }, 64);
     }
 
     if (filterLayer->_biases) {
@@ -1189,11 +1239,18 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
         num_rows = FROM_IR_DIM(inputs, 1);
     }
 
-    size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->getDims()), end(outputs->getDims()))
-        * outputs->getPrecision().size();
+    // TODO: solve this by layer level transformations
+    auto concatAlignFilter = CNNNetPrevLayer(layer, 0);
+    if (LayerInfo(concatAlignFilter).isConcatAlignFilter()) {
+        auto rowsCopiedOffset = concatAlignFilter->GetParamAsInt("rows_copied_offset");
+        if (rowsCopiedOffset != 0) {
+            num_rows -= rowsCopiedOffset / outputs->getPrecision().size();
+            layer->params["output_offset"] = std::to_string(rowsCopiedOffset);
+        }
+    }
 
-    size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->getDims()), end(inputs->getDims()))
-        * inputs->getPrecision().size();
+    size_t num_data_bytes_out = num_columns * num_rows * outputs->getPrecision().size();
+    size_t num_data_bytes_in = num_columns * num_rows * inputs->getPrecision().size();
 
     static InferenceEngine::details::caseless_unordered_map<std::string, DnnActivationType> supportedActivations = {
         {"sigmoid", kActSigmoid},
@@ -1626,7 +1683,7 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
 
             if (it != splitLayerInfoItem.splitOutputLayers.end()) {
                 gnalog()  << "Connecting " << splitName << " input \n";
-                auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset, 0);
+                auto res = connectInput(splittingLayer, ptr, splitLayerInfoItem.reserved_size, it->offset + offset, 0);
                 gnalog()  << "Connected \n";
                 return res;
             }
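
A small worked example of the FAST concat-alignment split introduced in ConcatAlignFilterPrimitive above, assuming ALIGN(x, n) rounds x up to the nearest multiple of n (as elsewhere in the plugin); the row counts are purely illustrative:

    #include <cstdint>
    #include <iostream>

    static uint32_t Align(uint32_t x, uint32_t n) { return ((x + n - 1) / n) * n; }

    int main() {
        uint32_t num_rows_in = 100;                        // rows reaching the align filter
        uint32_t num_rows_copied = 0;
        if (Align(num_rows_in, 32) > 32) {                 // FAST policy condition
            num_rows_copied = Align(num_rows_in, 32) - 32; // 128 - 32 = 96 rows go through a copy layer
        }
        uint32_t rows_for_filter = num_rows_in - num_rows_copied;  // only 4 rows stay in the affine filter
        std::cout << num_rows_copied << " rows copied, "
                  << rows_for_filter << " rows filtered\n";
        return 0;
    }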
index 87f80ce..ccaf60a 100644 (file)
@@ -26,6 +26,7 @@
 #include "backend/am_intel_dnn.hpp"
 #include "gna_device.hpp"
 #include "gna_data_types.hpp"
+#include "gna_plugin_policy.hpp"
 
 namespace GNAPluginNS {
 class GNAGraphCompiler {
@@ -34,6 +35,7 @@ private:
     std::shared_ptr<GNAPluginNS::gna_memory_type> gnamem;
     std::shared_ptr<GNAPluginNS::InputDesc> inputDesc;
     std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlags;
+    Policy policy;
 
     // layers with extra storage for connections and additional
     // non trivial processing
@@ -53,6 +55,7 @@ public:
     void setDNNPtr(std::shared_ptr<GNAPluginNS::backend::AMIntelDNN> dnnPtr);
     void setInputDescPtr(std::shared_ptr<GNAPluginNS::InputDesc> inputDescPtr);
     void setGNAFlagsPtr(std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlagsPtr);
+    void setPolicy(GNAPluginNS::Policy policy);
 
     void fillMemoryConnections(std::unordered_map<std::string,
             std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
index 1bf2dd3..1099911 100644 (file)
@@ -237,7 +237,6 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea
     auto convert_to_serial = [getOffsetFromBase](const GNAModelSerial::RuntimeEndPoint& ep) {
         ModelHeader::EndPoint out;
         out.elements_count = ep.elements_count;
-        out.element_size = ep.element_size;
         out.descriptor_offset = offsetFromBase(ep.descriptor_ptr);
         out.scaleFactor = ep.scaleFactor;
         return out;
index 8010c65..23f0cdc 100644 (file)
@@ -22,6 +22,7 @@
 #include <graph_tools.hpp>
 #include <debug.h>
 #include <gna/gna_config.hpp>
+#include "gna_plugin_config.hpp"
 #include <ie_util_internal.hpp>
 #include "gna_plugin.hpp"
 #include "optimizer/gna_pass_manager.hpp"
@@ -302,6 +303,7 @@ void GNAPlugin::ImportFrames(
 
 GNAPlugin::GNAPlugin() {
     Init();
+    UpdateFieldsFromConfig();
 }
 
 GNAPlugin::GNAPlugin(const std::map<std::string, std::string>& configMap) {
@@ -321,13 +323,13 @@ void GNAPlugin::Init() {
 
 void GNAPlugin::InitGNADevice() {
 #if GNA_LIB_VER == 1
-    gnadevice = std::make_shared<GNADeviceHelper>(gna_proc_type,
+    gnadevice = std::make_shared<GNADeviceHelper>(config.gna_proc_type,
                                         gnaFlags->gna_lib_async_threads_num,
                                         gnaFlags->gna_openmp_multithreading,
                                         gnaFlags->performance_counting);
 #else
-    gnadevice = std::make_shared<GNADeviceHelper>(pluginGna2AccMode,
-                pluginGna2DeviceConsistent,
+    gnadevice = std::make_shared<GNADeviceHelper>(config.pluginGna2AccMode,
+                                                  config.pluginGna2DeviceConsistent,
                 gnaFlags->gna_lib_async_threads_num,
                 gnaFlags->gna_openmp_multithreading,
                 gnaFlags->performance_counting);
@@ -387,7 +389,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
         run_passes(newNet, true);
         run_passes(newNet, false);
     } else {
-        switch (gnaPrecision) {
+        switch (config.gnaPrecision) {
             case Precision::I16:
                 ModelQuantizer<QuantI16> q16;
                 newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
@@ -421,6 +423,9 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
 
     auto sortedNet = CNNNetSortTopologicallyEx(*newNet, make_fuzed_order);
 
+    // passing policy to compiler
+    graphCompiler.setPolicy(policy);
+
     if (sortedNet.empty()) {
         THROW_GNA_EXCEPTION << "Sorted network is empty";
     }
@@ -534,10 +539,33 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) {
 
             gnalog() << "[UFS] from : "<< outPort.first <<" reached: " << layer->name << "\n";
 
+            // probing gna_primitives
             if (irLayerAvatar != graphCompiler.dnnComponents.components.end()) {
                 initOutput(portId, irLayerAvatar->second, layer);
                 stopSearching = true;
             }
+
+            // probing concatInfo
+            if (!stopSearching && LayerInfo(layer).isConcat()) {
+                auto concatConnection  = graphCompiler.concat_connection.find(layer->name);
+                if (concatConnection != graphCompiler.concat_connection.end()) {
+                    //initOutput(portId, irLayerAvatar->second, layer);
+
+                    auto &desc = outputsDesc[portId];
+                    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+
+                    desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
+                    // TODO: what is orientation for concat
+                    desc.orientation = kDnnInterleavedOrientation;
+                    desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
+                    desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+                    desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;
+
+                    // binding ptr for first infer request - then others will be setup during relocation
+                    gnamem->bind_ptr(&desc.ptrs.front(), &concatConnection->second.gna_ptr);
+                    stopSearching = true;
+                }
+            }
         }, true, [&stopSearching](InferenceEngine::CNNLayer* from) {
             return make_upstream_order(!stopSearching ? from : nullptr);
         });
@@ -722,20 +750,20 @@ void GNAPlugin::createRequestConfigsForGnaModels() {
 void GNAPlugin::DumpXNNToFile() const {
     // TODO: output  precision as well as pointer might be incorrect, LSTM for sure
     // gna looks automatically set layer 0 as output and adjust it's pointer / precision/ size respectively
-    if (dumpXNNPath.empty()) {
+    if (config.dumpXNNPath.empty()) {
         return;
     }
 
-    if (dumpXNNGeneration != "GNA1" &&
-        dumpXNNGeneration != "GNA3" &&
-        !dumpXNNGeneration.empty()) {
-        THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << dumpXNNGeneration;
+    if (config.dumpXNNGeneration != "GNA1" &&
+        config.dumpXNNGeneration != "GNA3" &&
+        !config.dumpXNNGeneration.empty()) {
+        THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << config.dumpXNNGeneration;
     }
 
     if (!gnadevice) {
         THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network";
     }
-    std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary);
+    std::ofstream dumpStream(config.dumpXNNPath, std::ios::out | std::ios::binary);
 #if GNA_LIB_VER == 1
     auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices);
     dump.header.rw_region_size = gnamem->getRWBytes();
@@ -745,7 +773,7 @@ void GNAPlugin::DumpXNNToFile() const {
     dumpStream.write(reinterpret_cast<char*>(dump.model.get()), dump.header.model_size);
 #else
     auto const modelId = gnadevice->createModel(std::get<0>(gnaModels.front())->obj);
-    if (dumpXNNGeneration != "GNA3") {
+    if (config.dumpXNNGeneration != "GNA3") {
         auto dump = gnadevice->dumpXnn(modelId);
         dump.header.RwRegionSize = gnamem->getRWBytes();
         dump.header.InputScalingFactor = inputsDesc->inputScaleFactors.front();
@@ -1204,228 +1232,14 @@ void GNAPlugin::GetPerformanceCounts(std::map<std::string, InferenceEngine::Infe
 
 void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {}
 
-void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config) {
-    Init();
-    auto supportedConfigOptions = supportedConfigKeys();
-
-    for (auto& item : config) {
-        auto keys = std::find_if(supportedConfigOptions.begin(), supportedConfigOptions.end(), [&item](const std::string& supportedConfigOption) {
-            return item.first == supportedConfigOption ||
-                   item.first.find(GNA_CONFIG_KEY(SCALE_FACTOR)) == 0;
-        });
-        if (keys == supportedConfigOptions.end()) {
-            THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first << " not supported";
-        }
-    }
-
-    // holds actual value of a found key
-    std::string key;
-    std::string value;
-    auto if_set = [&](const std::string& keyInput, const std::function<void()> & handler) {
-        auto keyInMap = config.find(keyInput);
-        if (keyInMap != config.end()) {
-            value = keyInMap->second;
-            handler();
-        }
-    };
-
-    auto if_start = [&](const std::string& keyInput, const std::function<void()> & handler) {
-        for (auto && c : config) {
-            if (c.first.find(keyInput) == 0) {
-                if (c.first.size() > keyInput.size() + 1) {
-                    key = c.first.substr(keyInput.size() + 1);
-                    value = c.second;
-                    handler();
-                }
-            }
-        }
-    };
-
-    auto fp32eq = [](float p1, float p2) -> bool {
-        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
-    };
-
-    auto & log = gnalog();
-
-    if_start(GNA_CONFIG_KEY(SCALE_FACTOR), [&, this] {
-        uint64_t scaleForInput = std::stoul(key, NULL, 10);
-        if (scaleForInput > 10) {
-            THROW_GNA_EXCEPTION << "input scale factor with index(" << key << ") unsupported";
-        }
-        auto scaleFactor = std::stod(value);
-        if (fp32eq(scaleFactor, 0.0f)) {
-            THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
-        }
-        // not appeared scale factors are to be 1.0f
-        if (inputsDesc->inputScaleFactors.size() <= scaleForInput) {
-            inputsDesc->inputScaleFactors.resize(scaleForInput + 1, 1.f);
-        }
-        inputsDesc->inputScaleFactors[scaleForInput] = InferenceEngine::CNNLayer::ie_parse_float(value);
-    });
-
-    if (inputsDesc->inputScaleFactors.empty()) {
-        if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] {
-            auto scaleFactor = InferenceEngine::CNNLayer::ie_parse_float(value);
-            if (fp32eq(scaleFactor, 0.0f)) {
-                THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
-            }
-            inputsDesc->inputScaleFactors.push_back(scaleFactor);
-        });
-    }
-
-    if (inputsDesc->inputScaleFactors.empty()) {
-        inputsDesc->inputScaleFactors.push_back(1.f);
-    }
-
-    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] {
-        dumpXNNPath = value;
-    });
-
-    if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), [&] {
-        dumpXNNGeneration = value;
-    });
-
-    if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] {
-#if GNA_LIB_VER == 1
-        static caseless_unordered_map <std::string, uint32_t> supported_values = {
-                {GNAConfigParams::GNA_AUTO, GNA_AUTO},
-                {GNAConfigParams::GNA_HW, GNA_HARDWARE},
-                {GNAConfigParams::GNA_SW, GNA_SOFTWARE},
-                {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
-        };
-        static std::vector <std::string> supported_values_on_gna2 = {
-            GNAConfigParams::GNA_GEN,
-            GNAConfigParams::GNA_GEN_EXACT,
-            GNAConfigParams::GNA_SSE,
-            GNAConfigParams::GNA_SSE_EXACT,
-            GNAConfigParams::GNA_AVX1,
-            GNAConfigParams::GNA_AVX1_EXACT,
-            GNAConfigParams::GNA_AVX2,
-            GNAConfigParams::GNA_AVX2_EXACT
-        };
-#else
-        static caseless_unordered_map <std::string, std::pair<Gna2AccelerationMode, Gna2DeviceVersion> > supported_values = {
-            {GNAConfigParams::GNA_AUTO, {Gna2AccelerationModeAuto, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_HW, {Gna2AccelerationModeHardware, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_SW, {Gna2AccelerationModeSoftware, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_SW_EXACT, {Gna2AccelerationModeSoftware, Gna2DeviceVersion1_0}},
-            {GNAConfigParams::GNA_GEN, {Gna2AccelerationModeGeneric, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_GEN_EXACT, {Gna2AccelerationModeGeneric, Gna2DeviceVersion1_0}},
-            {GNAConfigParams::GNA_SSE, {Gna2AccelerationModeSse4x2, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_SSE_EXACT, {Gna2AccelerationModeSse4x2, Gna2DeviceVersion1_0}},
-            {GNAConfigParams::GNA_AVX1, {Gna2AccelerationModeAvx1, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1, Gna2DeviceVersion1_0}},
-            {GNAConfigParams::GNA_AVX2, {Gna2AccelerationModeAvx2, Gna2DeviceVersionSoftwareEmulation}},
-            {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2, Gna2DeviceVersion1_0}},
-        };
-#endif
-        auto procType = supported_values.find(value);
-        if (procType == supported_values.end()) {
-            if (value == GNA_CONFIG_VALUE(SW_FP32)) {
-                gnaFlags->sw_fp32 = true;
-            } else {
-#if GNA_LIB_VER == 1
-                auto is_gna2_mode = std::find(
-                        supported_values_on_gna2.begin(),
-                        supported_values_on_gna2.end(),
-                        value);
-                if (is_gna2_mode != supported_values_on_gna2.end()) {
-                    THROW_GNA_EXCEPTION << "This GNA device mode require GNA2 library: " << value;
-                }
-#endif
-                THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
-            }
-        } else {
-#if GNA_LIB_VER == 1
-            gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
-#else
-            pluginGna2AccMode = procType->second.first;
-            pluginGna2DeviceConsistent = procType->second.second;
-#endif
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] {
-        if (value == PluginConfigParams::YES) {
-            gnaFlags->compact_mode = true;
-        } else if (value == PluginConfigParams::NO) {
-            gnaFlags->compact_mode = false;
-        } else {
-            log << "GNA compact mode should be YES/NO, but not" << value;
-            THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] {
-        if (value == PluginConfigParams::YES) {
-            gnaFlags->exclusive_async_requests  = true;
-        } else if (value == PluginConfigParams::NO) {
-            gnaFlags->exclusive_async_requests  = false;
-        } else {
-            log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(PRECISION), [&] {
-        auto precision = Precision::FromStr(value);
-        if (precision != Precision::I8 && precision != Precision::I16) {
-            log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
-            THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
-        }
-        gnaPrecision = precision;
-    });
-
-    if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] {
-        if (value == PluginConfigParams::YES) {
-            gnaFlags->uniformPwlDesign = true;
-        } else if (value == PluginConfigParams::NO) {
-            gnaFlags->uniformPwlDesign = false;
-        } else {
-            log << "GNA pwl uniform algorithm parameter "
-                << "should be equal to YES/NO, but not" << value;
-            THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
-                                << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(CONFIG_KEY(PERF_COUNT), [&] {
-        if (value == PluginConfigParams::YES) {
-            gnaFlags->performance_counting = true;
-        } else if (value == PluginConfigParams::NO) {
-            gnaFlags->performance_counting = false;
-        } else {
-            log << "GNA performance counter enabling parameter "
-                << "should be equal to YES/NO, but not" << value;
-            THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
-                                << "should be equal to YES/NO, but not" << value;
-        }
-    });
-
-    if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] {
-        uint64_t lib_threads = std::stoul(value, NULL, 10);
-        if (lib_threads == 0 || lib_threads > std::numeric_limits<uint8_t>::max()/2-1) {
-            log << "Unsupported accelerator lib number of threads: " << value << ", should be greateer than 0 and less than 127";
-            THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
-                                << ", should be greateer than 0 and less than 127";
-        }
-        gnaFlags->gna_lib_async_threads_num = lib_threads;
-    });
-
-    if_set(CONFIG_KEY(SINGLE_THREAD), [&] {
-        if (value == PluginConfigParams::YES) {
-            gnaFlags->gna_openmp_multithreading  = false;
-        } else if (value == PluginConfigParams::NO) {
-            gnaFlags->gna_openmp_multithreading  = true;
-        } else {
-            log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-            THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
-        }
-    });
+void GNAPlugin::SetConfig(const std::map<std::string, std::string> &config_map) {
+    config.UpdateFromMap(config_map);
+    UpdateFieldsFromConfig();
+}
 
-    if (gnaFlags->sw_fp32 && gnaFlags->gna_lib_async_threads_num > 1) {
-        THROW_GNA_EXCEPTION << "GNA plugin not support async mode on GNA_SW_FP32!";
-    }
+void GNAPlugin::UpdateFieldsFromConfig() {
+    inputsDesc->inputScaleFactors = config.inputScaleFactors;
+    *gnaFlags = config.gnaFlags;
 }
 
 void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network,
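
SetConfig is now a thin wrapper: the map goes into Config::UpdateFromMap and the derived plugin fields are re-synchronized, so callers keep configuring the device through the usual Core entry point. A hedged sketch; the key/value macros are assumed from gna_config.hpp / ie_plugin_config.hpp and the values are placeholders:

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>
    #include <gna/gna_config.hpp>
    #include <string>

    int main() {
        InferenceEngine::Core ie;
        // routed into GNAPlugin::SetConfig -> Config::UpdateFromMap -> UpdateFieldsFromConfig
        ie.SetConfig({
            {GNA_CONFIG_KEY(DEVICE_MODE), GNA_CONFIG_VALUE(SW_FP32)},
            {std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", "2048"},  // per-input scale factor
            {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)}
        }, "GNA");
        return 0;
    }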
index eb9f37b..10ccc7b 100644 (file)
@@ -22,6 +22,7 @@
 #include "gna_graph_compiler.hpp"
 #include "gna_plugin_policy.hpp"
 #include "gna_plugin_log.hpp"
+#include "gna_plugin_config.hpp"
 
 #if GNA_LIB_VER == 2
 #include <gna2-model-api.h>
@@ -32,6 +33,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::
  protected:
     std::string _pluginName = "GNA";
 
+    Config config;
     std::shared_ptr<GNAPluginNS::backend::AMIntelDNN> dnn;
     std::shared_ptr<GNAPluginNS::GNAFlags> gnaFlags;
     std::shared_ptr<GNAPluginNS::gna_memory_type> gnamem;
@@ -63,20 +65,12 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::
     // index matches iterating order of cnnnetwork outputs info
     std::vector<GNAPluginNS::OutputDesc> outputsDesc = std::vector<OutputDesc>();
 
-    // precision of GNA hardware model
-    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
-
     intel_dnn_number_type_t output_type = kDnnInt;
 
     GNAPluginNS::Policy policy;
-    std::string dumpXNNPath;
-    std::string dumpXNNGeneration;
-#if GNA_LIB_VER == 1
-    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
-#else
-    Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware;
-Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0;
-void createRequestConfigsForGnaModels();
+
+#if GNA_LIB_VER == 2
+    void createRequestConfigsForGnaModels();
 #endif
 
     std::shared_ptr<GNADeviceHelper> gnadevice;
@@ -104,15 +98,12 @@ void createRequestConfigsForGnaModels();
     void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap);
     void AddExtension(InferenceEngine::IExtensionPtr extension) override;
 
-    std::vector<std::string> supportedConfigKeys() const;
-    std::map<std::string, std::string> supportedConfigKeysWithDefaults() const;
-
     void SetConfig(const std::map<std::string, std::string> &config) override;
     void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
                      const InferenceEngine::ICNNNetwork &network,
-                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
+                     const std::map<std::string, std::string> &config_map) override { THROW_GNA_EXCEPTION << "Not implemented"; }
     InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::ICNNNetwork &network,
-                                  const std::map<std::string, std::string> &config,
+                                  const std::map<std::string, std::string> &config_map,
                                   InferenceEngine::RemoteContext::Ptr context) override { THROW_GNA_EXCEPTION << "Not implemented"; }
     void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result);
     void SetCore(InferenceEngine::ICore*) noexcept override {}
@@ -221,5 +212,8 @@ void createRequestConfigsForGnaModels();
                     const GNASplitLayer& splitInfo,
                     size_t precision_size,
                     int idx = 0);
+
+    void UpdateFieldsFromConfig();
 };
+
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.cpp b/inference-engine/src/gna_plugin/gna_plugin_config.cpp
new file mode 100644 (file)
index 0000000..5315920
--- /dev/null
@@ -0,0 +1,278 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gna/gna_config.hpp>
+#include "gna_plugin.hpp"
+#include "gna_plugin_config.hpp"
+#include "ie_common.h"
+#include <details/caseless.hpp>
+#include <unordered_map>
+
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+namespace GNAPluginNS {
+
+#if GNA_LIB_VER == 1
+static caseless_unordered_map<std::string, uint32_t> supported_values = {
+        {GNAConfigParams::GNA_AUTO,     GNA_AUTO},
+        {GNAConfigParams::GNA_HW,       GNA_HARDWARE},
+        {GNAConfigParams::GNA_SW,       GNA_SOFTWARE},
+        {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE}
+};
+static std::vector<std::string> supported_values_on_gna2 = {
+        GNAConfigParams::GNA_GEN,
+        GNAConfigParams::GNA_GEN_EXACT,
+        GNAConfigParams::GNA_SSE,
+        GNAConfigParams::GNA_SSE_EXACT,
+        GNAConfigParams::GNA_AVX1,
+        GNAConfigParams::GNA_AVX1_EXACT,
+        GNAConfigParams::GNA_AVX2,
+        GNAConfigParams::GNA_AVX2_EXACT
+};
+#else
+static caseless_unordered_map <std::string, std::pair<Gna2AccelerationMode, Gna2DeviceVersion>> supported_values = {
+                {GNAConfigParams::GNA_AUTO,       {Gna2AccelerationModeAuto,     Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_HW,         {Gna2AccelerationModeHardware, Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_SW,         {Gna2AccelerationModeSoftware, Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_SW_EXACT,   {Gna2AccelerationModeSoftware, Gna2DeviceVersion1_0}},
+                {GNAConfigParams::GNA_GEN,        {Gna2AccelerationModeGeneric,  Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_GEN_EXACT,  {Gna2AccelerationModeGeneric,  Gna2DeviceVersion1_0}},
+                {GNAConfigParams::GNA_SSE,        {Gna2AccelerationModeSse4x2,   Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_SSE_EXACT,  {Gna2AccelerationModeSse4x2,   Gna2DeviceVersion1_0}},
+                {GNAConfigParams::GNA_AVX1,       {Gna2AccelerationModeAvx1,     Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_AVX1_EXACT, {Gna2AccelerationModeAvx1,     Gna2DeviceVersion1_0}},
+                {GNAConfigParams::GNA_AVX2,       {Gna2AccelerationModeAvx2,     Gna2DeviceVersionSoftwareEmulation}},
+                {GNAConfigParams::GNA_AVX2_EXACT, {Gna2AccelerationModeAvx2,     Gna2DeviceVersion1_0}},
+        };
+#endif
+
+void Config::UpdateFromMap(const std::map<std::string, std::string>& config) {
+    for (auto&& item : config) {
+        auto key = item.first;
+        auto value = item.second;
+
+        auto fp32eq = [](float p1, float p2) -> bool {
+            return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+        };
+
+        auto &log = gnalog();
+
+        if (key.find(GNA_CONFIG_KEY(SCALE_FACTOR)) == 0) {
+            uint64_t input_index;
+            if (key == GNA_CONFIG_KEY(SCALE_FACTOR)) {
+                input_index = 0;
+            } else {
+                key.erase(0, strlen(GNA_CONFIG_KEY(SCALE_FACTOR)));
+                if (key[0] != '_') {
+                    THROW_GNA_EXCEPTION << "Invalid format of scale factor configuration key";
+                }
+                key.erase(0, 1);
+                try {
+                    input_index = std::stoi(key);
+                    if (input_index > 9) {
+                        throw std::out_of_range("");
+                    }
+                } catch (std::invalid_argument&) {
+                    THROW_GNA_EXCEPTION << "Invalid value of index of input scale factor";
+                } catch (std::out_of_range&) {
+                    THROW_GNA_EXCEPTION << "Index of input scale factor must be in the range [0..9], " << value << " provided";
+                }
+            }
+            auto scale_factor = InferenceEngine::CNNLayer::ie_parse_float(value);
+            if (fp32eq(scale_factor, 0.0f)) {
+                THROW_GNA_EXCEPTION << "input scale factor of 0.0f not supported";
+            }
+            // missing scale factors are set to be 1.0f
+            if (inputScaleFactors.size() <= input_index) {
+                inputScaleFactors.resize(input_index + 1, 1.f);
+            }
+            inputScaleFactors[input_index] = InferenceEngine::CNNLayer::ie_parse_float(value);
+        } else if (key == GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE)) {
+            dumpXNNPath = value;
+        } else if (key == GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION)) {
+            dumpXNNGeneration = value;
+        } else if (key == GNA_CONFIG_KEY(DEVICE_MODE)) {
+            auto procType = supported_values.find(value);
+            if (procType == supported_values.end()) {
+                if (value == GNA_CONFIG_VALUE(SW_FP32)) {
+                    gnaFlags.sw_fp32 = true;
+                } else {
+#if GNA_LIB_VER == 1
+                    auto is_gna2_mode = std::find(
+                            supported_values_on_gna2.begin(),
+                            supported_values_on_gna2.end(),
+                            value);
+                    if (is_gna2_mode != supported_values_on_gna2.end()) {
+                        THROW_GNA_EXCEPTION << "This GNA device mode requires GNA2 library: " << value;
+                    }
+#endif
+                    THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value;
+                }
+            } else {
+#if GNA_LIB_VER == 1
+                gna_proc_type = static_cast<intel_gna_proc_t>(procType->second);
+#else
+                pluginGna2AccMode = procType->second.first;
+                pluginGna2DeviceConsistent = procType->second.second;
+#endif
+            }
+        } else if (key == GNA_CONFIG_KEY(COMPACT_MODE)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.compact_mode = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.compact_mode = false;
+            } else {
+                log << "GNA compact mode should be YES/NO, but not " << value;
+                THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not " << value;
+            }
+        } else if (key == CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.exclusive_async_requests = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.exclusive_async_requests = false;
+            } else {
+                log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+            }
+        } else if (key == GNA_CONFIG_KEY(PRECISION)) {
+            auto precision = Precision::FromStr(value);
+            if (precision != Precision::I8 && precision != Precision::I16) {
+                log << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value;
+                THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: "
+                                    << value;
+            }
+            gnaPrecision = precision;
+        } else if (key == GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.uniformPwlDesign = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.uniformPwlDesign = false;
+            } else {
+                log << "GNA pwl uniform algorithm parameter "
+                    << "should be equal to YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
+                                    << "should be equal to YES/NO, but not" << value;
+            }
+        } else if (key == CONFIG_KEY(PERF_COUNT)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.performance_counting = true;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.performance_counting = false;
+            } else {
+                log << "GNA performance counter enabling parameter "
+                    << "should be equal to YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter "
+                                    << "should be equal to YES/NO, but not" << value;
+            }
+        } else if (key == GNA_CONFIG_KEY(LIB_N_THREADS)) {
+            uint64_t lib_threads;
+            try {
+                lib_threads = std::stoul(value);
+                if (lib_threads == 0 || lib_threads > (std::numeric_limits<uint8_t>::max()+1) / 2 - 1) {
+                    throw std::out_of_range("");
+                }
+            } catch (std::invalid_argument&) {
+                THROW_GNA_EXCEPTION << "Invalid value of number of threads";
+            } catch (std::out_of_range&) {
+                log << "Unsupported accelerator lib number of threads: " << value
+                    << ", should be greater than 0 and less than 127";
+                THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value
+                                    << ", should be greater than 0 and less than 127";
+            }
+            gnaFlags.gna_lib_async_threads_num = lib_threads;
+        } else if (key == CONFIG_KEY(SINGLE_THREAD)) {
+            if (value == PluginConfigParams::YES) {
+                gnaFlags.gna_openmp_multithreading = false;
+            } else if (value == PluginConfigParams::NO) {
+                gnaFlags.gna_openmp_multithreading = true;
+            } else {
+                log << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+                THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value;
+            }
+        } else {
+            THROW_GNA_EXCEPTION << as_status << NOT_FOUND << "Incorrect GNA Plugin config. Key " << item.first
+                                << " not supported";
+        }
+
+        if (gnaFlags.sw_fp32 && gnaFlags.gna_lib_async_threads_num > 1) {
+            THROW_GNA_EXCEPTION << "GNA plugin does not support async mode on GNA_SW_FP32!";
+        }
+    }
+
+    if (inputScaleFactors.empty()) {
+        inputScaleFactors.push_back(1.0f);
+    }
+
+    AdjustKeyMapValues();
+}
+
+void Config::AdjustKeyMapValues() {
+    key_config_map.clear();
+
+    if (inputScaleFactors.empty()) {
+        inputScaleFactors.push_back(1.0);
+    }
+    key_config_map[GNA_CONFIG_KEY(SCALE_FACTOR)] = std::to_string(inputScaleFactors[0]);
+    for (int n = 0; n < inputScaleFactors.size(); n++) {
+        key_config_map[GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_") + std::to_string(n)] =
+                std::to_string(inputScaleFactors[n]);
+    }
+    key_config_map[GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE)] = dumpXNNPath;
+    key_config_map[GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION)] = dumpXNNGeneration;
+
+    std::string device_mode;
+    if (gnaFlags.sw_fp32) {
+        device_mode = GNA_CONFIG_VALUE(SW_FP32);
+    } else {
+        for (auto&& value : supported_values) {
+#if GNA_LIB_VER == 1
+            if (value.second == gna_proc_type) {
+                device_mode = value.first;
+                break;
+            }
+#else
+            if (value.second.first == pluginGna2AccMode &&
+                value.second.second == pluginGna2DeviceConsistent) {
+                device_mode = value.first;
+                break;
+            }
+#endif
+        }
+    }
+    IE_ASSERT(!device_mode.empty());
+    key_config_map[GNA_CONFIG_KEY(DEVICE_MODE)] = device_mode;
+    key_config_map[GNA_CONFIG_KEY(COMPACT_MODE)] =
+            gnaFlags.compact_mode ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS)] =
+            gnaFlags.exclusive_async_requests ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name();
+    key_config_map[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] =
+            gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[CONFIG_KEY(PERF_COUNT)] =
+            gnaFlags.performance_counting ? PluginConfigParams::YES: PluginConfigParams::NO;
+    key_config_map[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num);
+    key_config_map[CONFIG_KEY(SINGLE_THREAD)] =
+            gnaFlags.gna_openmp_multithreading ? PluginConfigParams::NO: PluginConfigParams::YES;
+}
+
+std::string Config::GetParameter(const std::string& name) const {
+    auto result = key_config_map.find(name);
+    if (result == key_config_map.end()) {
+        THROW_GNA_EXCEPTION << "Unsupported config key: " << name;
+    }
+    return result->second;
+}
+
+std::vector<std::string> Config::GetSupportedKeys() const {
+    std::vector<std::string> result;
+    for (auto&& configOption : key_config_map) {
+        result.push_back(configOption.first);
+    }
+    return result;
+}
+
+}  // namespace GNAPluginNS
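
Inside the plugin the new Config type acts as a small self-contained container: UpdateFromMap validates and applies a key/value map, AdjustKeyMapValues regenerates the flat key_config_map, and GetParameter/GetSupportedKeys back the query API. A minimal sketch of that flow, assuming the plugin headers are on the include path and GNA_LIB_VER is set by the build:

    #include "gna_plugin_config.hpp"
    #include <gna/gna_config.hpp>
    #include <iostream>
    #include <string>

    int main() {
        GNAPluginNS::Config config;                     // constructor pre-fills key_config_map with defaults
        config.UpdateFromMap({
            {GNA_CONFIG_KEY(DEVICE_MODE), GNA_CONFIG_VALUE(SW_EXACT)},
            {std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", "1024"}
        });
        std::cout << config.GetParameter(GNA_CONFIG_KEY(DEVICE_MODE)) << "\n";  // "GNA_SW_EXACT"
        for (auto&& key : config.GetSupportedKeys())
            std::cout << key << "\n";                   // every key registered by AdjustKeyMapValues
        return 0;
    }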
diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp
new file mode 100644 (file)
index 0000000..4bc24bd
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#if GNA_LIB_VER == 1
+#include <gna-api.h>
+#else
+#include <gna2-inference-api.h>
+#include <gna2-common-api.h>
+#endif
+#include "ie_precision.hpp"
+#include "descriptions/gna_flags.hpp"
+#include <vector>
+#include <map>
+
+namespace GNAPluginNS {
+
+struct Config {
+    Config() {
+        AdjustKeyMapValues();
+    }
+    void UpdateFromMap(const std::map<std::string, std::string>& configMap);
+    void AdjustKeyMapValues();
+    std::string GetParameter(const std::string& name) const;
+    std::vector<std::string> GetSupportedKeys() const;
+
+    // precision of GNA hardware model
+    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;
+
+    std::string dumpXNNPath;
+    std::string dumpXNNGeneration;
+
+#if GNA_LIB_VER == 1
+    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);
+#else
+    Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware;
+    Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0;
+#endif
+
+    std::vector<float> inputScaleFactors;
+    GNAFlags gnaFlags;
+
+    std::map<std::string, std::string> key_config_map;
+};
+
+}  // namespace GNAPluginNS
index 7c5a8c7..dc5a584 100644 (file)
 #include <cpp_interfaces/impl/ie_plugin_internal.hpp>
 #include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
 #include "gna_executable_network.hpp"
+#include "gna_plugin_config.hpp"
 
 namespace GNAPluginNS {
 
 class GNAPluginInternal  : public InferenceEngine::InferencePluginInternal {
- public:
+private:
+    Config defaultConfig;
+    std::weak_ptr <GNAPlugin> plgPtr;
+    std::shared_ptr<GNAPlugin> GetCurrentPlugin() const {
+        auto ptr = plgPtr.lock();
+        if (ptr == nullptr) {
+            return std::make_shared<GNAPlugin>();
+        } else {
+            return ptr;
+        }
+    }
+
+public:
     InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(const InferenceEngine::ICore * core,
                                                 const InferenceEngine::ICNNNetwork &network,
                                                 const std::map<std::string, std::string> &config) override {
-        return std::make_shared<GNAExecutableNetwork>(*cloneNet(network), config);
+        Config updated_config(defaultConfig);
+        updated_config.UpdateFromMap(config);
+        auto plg = std::make_shared<GNAPlugin>(updated_config.key_config_map);
+        plgPtr = plg;
+        return std::make_shared<GNAExecutableNetwork>(*cloneNet(network), plg);
     }
+
     void SetConfig(const std::map<std::string, std::string> &config) override {
-        auto plg = std::make_shared<GNAPlugin>();
-        plg->SetConfig(config);
+        defaultConfig.UpdateFromMap(config);
     }
+
     InferenceEngine::IExecutableNetwork::Ptr  ImportNetwork(
                                                 const std::string &modelFileName,
                                                 const std::map<std::string, std::string> &config) override {
-        return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, config));
+        Config updated_config(defaultConfig);
+        updated_config.UpdateFromMap(config);
+        auto plg = std::make_shared<GNAPlugin>(updated_config.key_config_map);
+        plgPtr = plg;
+        return make_executable_network(std::make_shared<GNAExecutableNetwork>(modelFileName, plg));
     }
+
     using InferenceEngine::InferencePluginInternal::ImportNetwork;
 
     std::string GetName() const noexcept override {
-        auto plg = std::make_shared<GNAPlugin>();
-        return plg->GetName();
+        return GetCurrentPlugin()->GetName();
     }
 
     void QueryNetwork(const InferenceEngine::ICNNNetwork& network,
                       const std::map<std::string, std::string>& config,
                       InferenceEngine::QueryNetworkResult& res) const override {
-        auto plg = std::make_shared<GNAPlugin>();
+        auto plg = GetCurrentPlugin();
         try {
             plg->SetConfig(config);
         } catch (InferenceEngine::details::InferenceEngineException) {}
@@ -48,13 +70,11 @@ class GNAPluginInternal  : public InferenceEngine::InferencePluginInternal {
 
     InferenceEngine::Parameter GetMetric(const std::string& name,
                                          const std::map<std::string, InferenceEngine::Parameter> & options) const override {
-        GNAPlugin statelessPlugin;
-        return statelessPlugin.GetMetric(name, options);
+        return GetCurrentPlugin()->GetMetric(name, options);
     }
 
     InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string, InferenceEngine::Parameter> & options) const override {
-        GNAPlugin statelessPlugin;
-        return statelessPlugin.GetConfig(name, options);
+        return defaultConfig.GetParameter(name);
     }
 };
 
index 38dab44..8e39258 100644 (file)
@@ -37,8 +37,9 @@ class Policy {
     enum class ConcatAlignment {
         DISABLED,
         DISABLED_FOR_FP32,
-        ENABLED
-    } ConcatAlignmentPolicy = ConcatAlignment::ENABLED;
+        ENABLED,
+        FAST
+    } ConcatAlignmentPolicy = ConcatAlignment::FAST;
 };
 
 inline std::ostream& operator<<(std::ostream& os, Policy::ScaleShift policy) {
@@ -51,4 +52,16 @@ inline std::ostream& operator<<(std::ostream& os, Policy::ScaleShift policy) {
     return os;
 }
 
+inline std::ostream& operator<<(std::ostream& os, Policy::ConcatAlignment policy) {
+    switch (policy) {
+        case Policy::ConcatAlignment::DISABLED   : os << "DISABLED";    break;
+        case Policy::ConcatAlignment::DISABLED_FOR_FP32   : os << "DISABLED_FOR_FP32";    break;
+        case Policy::ConcatAlignment::ENABLED   : os << "ENABLED";    break;
+        case Policy::ConcatAlignment::FAST   : os << "FAST";    break;
+        default    : os.setstate(std::ios_base::failbit);
+    }
+    return os;
+}
+
+
 }  // namespace GNAPluginNS
index eaa116d..9d16642 100644 (file)
@@ -16,19 +16,14 @@ using namespace GNAPluginNS;
 using namespace InferenceEngine;
 using namespace InferenceEngine::PluginConfigParams;
 
-Parameter GNAPlugin::GetConfig(const std::string& name, const std::map<std::string, Parameter> & options) const {
-    auto configKeys = supportedConfigKeysWithDefaults();
-    auto result = configKeys.find(name);
-    if (result == configKeys.end()) {
-        THROW_GNA_EXCEPTION << "unsupported config key: " << name;
-    }
-    return result->second;
+Parameter GNAPlugin::GetConfig(const std::string& name, const std::map<std::string, Parameter> & /*options*/) const {
+    return config.GetParameter(name);
 }
 
 Parameter GNAPlugin::GetMetric(const std::string& name, const std::map<std::string, InferenceEngine::Parameter> & options) const {
     const std::unordered_map<std::string, std::function<Parameter()>> queryApiSupported = {
         {METRIC_KEY(AVAILABLE_DEVICES), [this]() {return GetAvailableDevices();}},
-        {METRIC_KEY(SUPPORTED_CONFIG_KEYS), [this]() {return supportedConfigKeys();}},
+        {METRIC_KEY(SUPPORTED_CONFIG_KEYS), [this]() {return config.GetSupportedKeys();}},
         {METRIC_KEY(FULL_DEVICE_NAME), [&options, this]() {
             auto availableDevices = GetAvailableDevices().as<std::vector<std::string>>();
 
@@ -100,29 +95,3 @@ Parameter GNAPlugin::GetAvailableDevices() const {
 
     return devices;
 }
-
-std::map<std::string, std::string> GNAPlugin::supportedConfigKeysWithDefaults() const {
-    std::map<std::string, std::string>  options = {
-        {GNA_CONFIG_KEY(SCALE_FACTOR), "1.0"},
-        {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), ""},
-        {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), ""},
-        {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_AUTO},
-        {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(NO)},
-        {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
-        {GNA_CONFIG_KEY(PRECISION), Precision(Precision::I8).name()},
-        {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(YES)},
-        {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)},
-        {GNA_CONFIG_KEY(LIB_N_THREADS), "1"},
-        {CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)}
-    };
-    return options;
-}
-
-
-std::vector<std::string> GNAPlugin::supportedConfigKeys()const {
-    std::vector<std::string> result;
-    for (auto && configOption : supportedConfigKeysWithDefaults()) {
-        result.push_back(configOption.first);
-    }
-    return result;
-}
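
GetMetric and GetConfig are now served by the shared Config object instead of an ad-hoc key list, so the supported keys and their current (default or user-set) values can be inspected through the Core API. A small sketch; the .as<std::string>() call assumes the GNA plugin keeps its config values as strings, as key_config_map suggests:

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>
    #include <gna/gna_config.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        InferenceEngine::Core core;

        // Served by GNAPlugin::GetMetric via config.GetSupportedKeys().
        auto keys = core.GetMetric("GNA", METRIC_KEY(SUPPORTED_CONFIG_KEYS))
                        .as<std::vector<std::string>>();
        for (const auto& key : keys) {
            std::cout << key << std::endl;
        }

        // Served by Config::GetParameter; returns the current value for the key.
        std::cout << core.GetConfig("GNA", GNA_CONFIG_KEY(SCALE_FACTOR)).as<std::string>()
                  << std::endl;
        return 0;
    }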
index 1614ca6..0c4b9f1 100644 (file)
@@ -675,6 +675,10 @@ void InsertCopyLayerPass::run() {
 
 void InsertConcatAligningFilterPass::run() {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
+
+    if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED) {
+        return;
+    }
     // aligning specific not required in fp32 mode
     if (getPassManager()->getPolicy().ConcatAlignmentPolicy == Policy::ConcatAlignment::DISABLED_FOR_FP32 && !quantized) {
         return;
@@ -740,6 +744,10 @@ void InsertConcatAligningFilterPass::run() {
                 // encodes offset to beginning of split layer input
                 concatAligningFilter->params["output_offset"] =
                         std::to_string((aligned64_offset / bytesPerConcatElement) * (quantized ? bytesPerConcatElement : 4));
+
+                // for padded rows we cannot use copy layer - TBD how to implement
+                concatAligningFilter->params["num_rows_padded"] = std::to_string(num_rows_padded);
+
                 // encodes original output size
                 concatAligningFilter->params["original_num_rows"] = std::to_string(num_rows_in);
 
@@ -1084,7 +1092,7 @@ int PassManager::run(int index) {
         saveGraphToDot(*network.get(), out, [](const CNNLayerPtr layer,
                                                ordered_properties &printed_properties,
                                                ordered_properties &node_properties) {});
-        network->serialize(name + ".xml", "", nullptr);
+        network->serialize(name + ".xml", name + ".bin", nullptr);
     };
 #else
     auto dumpNetworkAfterPass = [] (std::shared_ptr<Pass> ) {};
index c5513f3..24c091e 100644 (file)
@@ -48,7 +48,7 @@ HeteroAsyncInferRequest::HeteroAsyncInferRequest(const HeteroInferRequest::Ptr&
 
 void HeteroAsyncInferRequest::StartAsync_ThreadUnsafe() {
     _heteroInferRequest->updateInOutIfNeeded();
-    RunFirstStage();
+    RunFirstStage(_pipeline.begin(), _pipeline.end());
 }
 
 StatusCode HeteroAsyncInferRequest::Wait(int64_t millis_timeout) {
index 2e32626..f4e221d 100644 (file)
@@ -65,6 +65,27 @@ HeteroInferRequest::HeteroInferRequest(InferenceEngine::InputsDataMap networkInp
     }
 }
 
+void HeteroInferRequest::SetBlob(const char* name, const InferenceEngine::Blob::Ptr& data) {
+    InferenceEngine::InferRequestInternal::SetBlob(name, data);
+    assert(!_inferRequests.empty());
+    for (auto &&desc : _inferRequests) {
+        auto &r = desc._request;
+        assert(nullptr != r);
+        InputInfo::Ptr foundInput;
+        DataPtr foundOutput;
+        try {
+            // if `name` is an input blob
+            if (findInputAndOutputBlobByName(name, foundInput, foundOutput)) {
+                r->SetBlob(name, data, foundInput->getPreProcess());
+            }
+        } catch (const InferenceEngine::details::InferenceEngineException & ex) {
+            std::string message = ex.what();
+            if (message.find(NOT_FOUND_str) == std::string::npos)
+                throw ex;
+        }
+    }
+}
+
 void HeteroInferRequest::InferImpl() {
     updateInOutIfNeeded();
     size_t i = 0;
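
HeteroInferRequest::SetBlob now forwards a user-provided blob to every underlying device request that recognizes the name and swallows only NOT_FOUND errors, so a blob set once on the HETERO request reaches the sub-request that actually owns that input. A hedged sketch of the calling side (model path and device list are placeholders):

    #include <ie_core.hpp>

    int main() {
        InferenceEngine::Core core;
        auto network = core.ReadNetwork("model.xml");           // placeholder model
        auto executable = core.LoadNetwork(network, "HETERO:GPU,CPU");
        auto request = executable.CreateInferRequest();

        auto inputName = network.getInputsInfo().begin()->first;
        auto input = request.GetBlob(inputName);                 // blob allocated by the plugin
        // ... fill `input` with data ...
        request.SetBlob(inputName, input);  // propagated to the sub-request owning this input
        request.Infer();
        return 0;
    }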
index 72099b4..18163f7 100644 (file)
@@ -39,6 +39,8 @@ public:
 
     void InferImpl() override;
 
+    void SetBlob(const char* name, const InferenceEngine::Blob::Ptr& data) override;
+
     void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) const override;
 
     void updateInOutIfNeeded();
index 2071c12..54c9a23 100644 (file)
@@ -17,6 +17,8 @@ set(IE_STATIC_DEPENDENT_FILES ${CMAKE_CURRENT_SOURCE_DIR}/file_utils.cpp)
 list(REMOVE_ITEM LIBRARY_SRC ${IE_STATIC_DEPENDENT_FILES})
 
 set(IE_BASE_SOURCE_FILES
+      ${CMAKE_CURRENT_SOURCE_DIR}/cnn_network_ngraph_impl.cpp
+      ${CMAKE_CURRENT_SOURCE_DIR}/generic_ie.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/blob_factory.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/ie_data.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/ie_layouts.cpp
@@ -94,7 +96,6 @@ file(GLOB_RECURSE plugin_api_src "${IE_MAIN_SOURCE_DIR}/src/plugin_api/*.hpp"
                                  "${IE_MAIN_SOURCE_DIR}/src/plugin_api/*.h")
 
 add_cpplint_target(${TARGET_NAME}_plugin_api_cpplint FOR_SOURCES ${plugin_api_src})
-add_clang_format_target(${TARGET_NAME}_plugin_api_clang_format FOR_SOURCES ${plugin_api_src})
 
 # Create common base object library
 
@@ -103,6 +104,7 @@ add_library(${TARGET_NAME}_common_obj OBJECT
 
 target_compile_definitions(${TARGET_NAME}_common_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
 target_include_directories(${TARGET_NAME}_common_obj PRIVATE
+    $<TARGET_PROPERTY:${TARGET_NAME}_transformations,INTERFACE_INCLUDE_DIRECTORIES>
     $<TARGET_PROPERTY:${TARGET_NAME}_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
 
 target_include_directories(${TARGET_NAME}_common_obj SYSTEM PRIVATE
@@ -121,6 +123,7 @@ target_include_directories(${TARGET_NAME}_obj SYSTEM PRIVATE $<TARGET_PROPERTY:n
                                                              $<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>)
 
 target_include_directories(${TARGET_NAME}_obj PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+                                                      $<TARGET_PROPERTY:${TARGET_NAME}_ir_readers,INTERFACE_INCLUDE_DIRECTORIES>
                                                       $<TARGET_PROPERTY:${TARGET_NAME}_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
 
 if(ENABLE_PROFILING_ITT AND INTEL_ITT_LIBS)
@@ -146,7 +149,6 @@ if(WIN32)
 endif()
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}_obj)
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME}_obj)
 
 # Create shared library file from object library
 
@@ -196,7 +198,6 @@ target_include_directories(${TARGET_NAME}_nn_builder PRIVATE "${CMAKE_CURRENT_SO
     "${IE_MAIN_SOURCE_DIR}/src/legacy_api/src")
 
 add_cpplint_target(${TARGET_NAME}_nn_builder_cpplint FOR_TARGETS ${TARGET_NAME}_nn_builder)
-add_clang_format_target(${TARGET_NAME}_nn_builder_clang_format FOR_TARGETS ${TARGET_NAME}_nn_builder)
 
 # Static library used for unit tests which are always built
 
@@ -272,6 +273,10 @@ if(THREADING STREQUAL "TBB" OR THREADING STREQUAL "TBB_AUTO")
     install(FILES "${TBB}/LICENSE"
             DESTINATION ${IE_CPACK_IE_DIR}/external/tbb
             COMPONENT tbb)
+    install(FILES "${TBB}/cmake/TBBConfig.cmake"
+                  "${TBB}/cmake/TBBConfigVersion.cmake"
+            DESTINATION ${IE_CPACK_IE_DIR}/external/tbb/cmake
+            COMPONENT tbb)
 endif()
 
 ie_cpack_add_component(core REQUIRED DEPENDS ${core_components})
@@ -279,8 +284,8 @@ ie_cpack_add_component(core REQUIRED DEPENDS ${core_components})
 install(DIRECTORY "${IE_MAIN_SOURCE_DIR}/include" DESTINATION ${IE_CPACK_IE_DIR}
         COMPONENT core)
 install(TARGETS ${TARGET_NAME} ${TARGET_NAME}_nn_builder
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
 install(FILES "${OpenVINO_BINARY_DIR}/share/ie_parallel.cmake"
               "${OpenVINO_BINARY_DIR}/share/InferenceEngineConfig.cmake"
index e972b39..8572297 100644 (file)
@@ -46,6 +46,8 @@ InferenceEngine::Blob::Ptr CreateBlobFromData(const InferenceEngine::DataPtr& da
         return std::make_shared<InferenceEngine::TBlob<int8_t>>(desc);
     case InferenceEngine::Precision::I32:
         return std::make_shared<InferenceEngine::TBlob<int32_t>>(desc);
+    case InferenceEngine::Precision::BF16:
+            return std::make_shared<InferenceEngine::TBlob<short>>(desc);
     default:
         THROW_IE_EXCEPTION << "precision is no set";
     }
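
CreateBlobFromData now maps Precision::BF16 to a TBlob<short>, i.e. bfloat16 values are carried in 16-bit storage. A hedged sketch of creating such a blob directly (the dimensions are arbitrary):

    #include <ie_blob.h>
    #include <memory>

    int main() {
        using namespace InferenceEngine;

        // BF16 uses 16-bit elements, matching the TBlob<short> chosen by the blob factory.
        TensorDesc desc(Precision::BF16, {1, 3, 224, 224}, Layout::NCHW);
        auto blob = std::make_shared<TBlob<short>>(desc);
        blob->allocate();

        short* raw = blob->buffer().as<short*>();
        raw[0] = 0;  // raw bfloat16 bit patterns go here
        return 0;
    }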
index 371090c..f5af3b8 100644 (file)
@@ -28,7 +28,6 @@
 #include "graph_tools.hpp"
 #include "graph_transformer.h"
 #include "ie_util_internal.hpp"
-#include "ie_cnn_layer_builder_ngraph.h"
 #include "ie_ngraph_utils.hpp"
 #include "ie_profiling.hpp"
 #include "network_serializer.h"
@@ -97,7 +96,7 @@ void CNNNetworkNGraphImpl::createDataForResult(const ::ngraph::Output<::ngraph::
     if (ptr) {
         ptr->reshape(dims, ptr->getTensorDesc().getLayout());
     } else {
-        const auto precision = details::ngraph::convertPrecision(output.get_element_type());
+        const auto precision = details::convertPrecision(output.get_element_type());
         const auto layout = TensorDesc::getLayoutByDims(dims);
         ptr.reset(new NGraphData(this, outName, {precision, dims, layout}));
     }
@@ -520,287 +519,3 @@ void CNNNetworkNGraphImpl::convertToCNNNetworkImpl() {
     ::ngraph::pass::ConvertOpSet1ToLegacy().run_on_function(graph);
     cnnNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, *this);
 }
-
-std::shared_ptr<CNNNetworkNGraphImpl> CNNNetworkNGraphImpl::cloneNGraphImpl() const {
-    auto result = std::make_shared<CNNNetworkNGraphImpl>(cloneFunction());
-    for (const auto& outputInfo : _outputData) {
-        result->_outputData[outputInfo.first]->setPrecision(outputInfo.second->getPrecision());
-        result->_outputData[outputInfo.first]->setLayout(outputInfo.second->getLayout());
-    }
-    for (const auto& inputInfo : _inputData) {
-        result->_inputData[inputInfo.first]->setPrecision(inputInfo.second->getPrecision());
-        result->_inputData[inputInfo.first]->setLayout(inputInfo.second->getLayout());
-        result->_inputData[inputInfo.first]->getPreProcess() = inputInfo.second->getPreProcess();
-    }
-    if (cnnNetwork)
-        result->cnnNetwork = cloneNet(*cnnNetwork);
-    return result;
-}
-
-void CNNNetworkNGraphImpl::transformConstants() {
-    if (!cnnNetwork)
-        convertToCNNNetworkImpl();
-    // Remove all redundant constant and convert unsupported precisions
-    ConstTransformer transformator(cnnNetwork.get());
-    transformator.fullTrim();
-}
-
-void InferenceEngine::details::CNNLayerCreator::on_adapter(const std::string& name,
-                                                           ::ngraph::ValueAccessor<void>& adapter) {
-    if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::element::Type>>(&adapter)) {
-        auto type = static_cast<::ngraph::element::Type&>(*a);
-        params[name] = details::ngraph::convertPrecision(type).name();
-    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::PartialShape>>(&adapter)) {
-        std::string dims;
-        auto shape = static_cast<::ngraph::PartialShape&>(*a);
-        for (size_t i = 0; i < shape.rank().get_length(); i++) {
-            if (!dims.empty()) dims += ",";
-            dims += std::to_string(shape[i].get_length());
-        }
-        params[name] = dims;
-    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Shape>>(&adapter)) {
-        std::string dims;
-        auto shape = static_cast<::ngraph::Shape&>(*a);
-        for (size_t i = 0; i < shape.size(); i++) {
-            if (!dims.empty()) dims += ",";
-            dims += std::to_string(shape[i]);
-        }
-        params[name] = dims;
-    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Strides>>(&adapter)) {
-        std::string dims;
-        auto shape = static_cast<::ngraph::Strides&>(*a);
-        for (size_t i = 0; i < shape.size(); i++) {
-            if (!dims.empty()) dims += ",";
-            dims += std::to_string(shape[i]);
-        }
-        params[name] = dims;
-    }
-}
-
-InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node): node(node) {
-    addSpecificCreator({"Parameter"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                         const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), "Input",
-            details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<CNNLayer>(attrs);
-        return res;
-    });
-    // TODO - Remove "GreaterEq" once ngraph transitions to GreaterEqual
-    addSpecificCreator({"Eltwise", "Subtract", "Power", "Maximum", "Divide", "Greater", "GreaterEqual", "FloorMod", "LogicalOr", "LogicalAnd", "LogicalXor",
-        "GreaterEq", "Less", "LessEqual", "Equal", "NotEqual", "Multiply", "Add"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                                                 const std::map<std::string, std::string> params) -> CNNLayerPtr {
-            LayerParams attrs = {node->get_friendly_name(), "Eltwise",
-                details::ngraph::convertPrecision(node->get_output_element_type(0))};
-            auto res = std::make_shared<EltwiseLayer>(attrs);
-            res->params = params;
-            if (node->description() == "Maximum") {
-                res->params["operation"] = "max";
-            } else if (node->description() == "Power") {
-                res->params["operation"] = "pow";
-            } else if (node->description() == "Subtract") {
-                res->params["operation"] = "sub";
-            } else if (node->description() == "Divide") {
-                res->params["operation"] = "div";
-            } else if (node->description() == "LessEqual") {
-                res->params["operation"] = "less_equal";
-            } else if (node->description() == "Less") {
-                res->params["operation"] = "less";
-            } else if (node->description() == "Equal") {
-                res->params["operation"] = "equal";
-            } else if (node->description() == "NotEqual") {
-                res->params["operation"] = "not_equal";
-            } else if (node->description() == "FloorMod") {
-                res->params["operation"] = "floor_mod";
-            } else if (node->description() == "Multiply") {
-                res->params["operation"] = "prod";
-            } else if (node->description() == "Add") {
-                res->params["operation"] = "sum";
-            } else if (node->description() == "Greater") {
-                res->params["operation"] = "greater";
-            } else if (node->description() == "GreaterEq") {
-                res->params["operation"] = "greater_equal";
-            } else if (node->description() == "GreaterEqual") {
-                res->params["operation"] = "greater_equal";
-            } else if (node->description() == "LogicalOr") {
-                res->params["operation"] = "logical_or";
-            } else if (node->description() == "LogicalAnd") {
-                res->params["operation"] = "logical_and";
-            } else if (node->description() == "LogicalXor") {
-                res->params["operation"] = "logical_xor";
-            } else if (node->description() == "Eltwise") {
-                auto castedLayer = std::dynamic_pointer_cast<::ngraph::op::Eltwise>(node);
-                if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << attrs.type << " layer " << attrs.name;
-                std::string type;
-                switch (castedLayer->eltwise_type) {
-                case ELTWISE_TYPE::Sum:
-                    type = "sum";
-                    break;
-                case ELTWISE_TYPE::Prod:
-                    type = "prod";
-                    break;
-                default:
-                    THROW_IE_EXCEPTION << "Not supported eltwise type!";
-                }
-
-                res->params["operation"] = type;
-            }
-            return res;
-        });
-    addSpecificCreator({"Concat"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), node->description(),
-            details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<ConcatLayer>(attrs);
-        res->params = params;
-        return res;
-    });
-    addSpecificCreator({"AvgPool", "MaxPool"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                                  const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), "Pooling",
-            details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<PoolingLayer>(attrs);
-        res->params = params;
-        if (res->params.find("auto_pad") != res->params.end() &&
-            details::CaselessEq<std::string>()(res->params["auto_pad"], "EXPLICIT"))
-            res->params.erase("auto_pad");
-
-        if (res->params.find("exclude_pad") != res->params.end()) {
-            res->params["exclude-pad"] = res->params["exclude_pad"];
-            res->params.erase("exclude_pad");
-        }
-
-        if (node->description() == "MaxPool") {
-            res->params["pool-method"] = "max";
-        } else if (node->description() == "AvgPool") {
-            res->params["pool-method"] = "avg";
-        }
-        return res;
-    });
-    addSpecificCreator({"Select"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), node->description(),
-                             details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<SelectLayer>(attrs);
-        res->params = params;
-        return res;
-    });
-    addSpecificCreator({"BinaryConvolution"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), node->description(),
-                             details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<BinaryConvolutionLayer>(attrs);
-
-        // todo: investigate difference between ngraph parameters for BinConvolution and the implementation above
-        // this leads to accuracy issue for Precollected_ONNX_ResNet50_88percentinto1bit e2e test
-        // res->params = params;
-
-        auto castedLayer = ::ngraph::as_type_ptr<::ngraph::op::v1::BinaryConvolution>(node);
-
-        std::string value;
-        for (const auto& val : castedLayer->get_pads_begin()) {
-            if (!value.empty()) value += ",";
-            value += Builder::asString(val);
-        }
-        res->params["pads_begin"] = value;
-
-        value.clear();
-        for (const auto& val : castedLayer->get_pads_end()) {
-            if (!value.empty()) value += ",";
-            value += Builder::asString(val);
-        }
-        res->params["pads_end"] = value;
-
-        switch (castedLayer->get_auto_pad()) {
-            case ::ngraph::op::PadType::SAME_UPPER:
-                res->params["auto_pad"] = "same_upper";
-                break;
-            case ::ngraph::op::PadType::SAME_LOWER:
-                res->params["auto_pad"] = "same_lower";
-                break;
-            case ::ngraph::op::PadType::VALID:
-                res->params["auto_pad"] = "valid";
-                break;
-            default:
-                break;
-        }
-
-        value.clear();
-        for (const auto& val : castedLayer->get_strides()) {
-            if (!value.empty()) value += ",";
-            value += Builder::asString(val);
-        }
-        res->params["strides"] = value;
-
-        value.clear();
-        for (const auto& val : castedLayer->get_dilations()) {
-            if (!value.empty()) value += ",";
-            value += Builder::asString(val);
-        }
-        res->params["dilations"] = value;
-
-        // Restore kernel size and output
-        const auto& shape = castedLayer->get_input_shape(1);
-        res->params["output"] = Builder::asString(shape[0]);
-
-        value.clear();
-        for (size_t i = 2; i < shape.size(); i++) {
-            if (!value.empty()) value += ",";
-            value += Builder::asString(shape[i]);
-        }
-        res->params["kernel"] = value;
-
-        switch (castedLayer->get_mode()) {
-            case ::ngraph::op::v1::BinaryConvolution::BinaryConvolutionMode::XNOR_POPCOUNT:
-                res->params["mode"] = "xnor-popcount";
-        }
-
-        auto weights_shape = castedLayer->input(1).get_source_output().get_shape();
-        res->params["input"] = Builder::asString(weights_shape[1]);
-        res->params["pad_value"] = Builder::asString(castedLayer->get_pad_value());
-
-        Builder::NodeConverter<::ngraph::op::Constant> converter;
-
-        const auto weightsNode = castedLayer->get_inputs()[1].get_output().get_node();
-        if (converter.canCreate(weightsNode)) {
-            const auto& weights = converter.createLayer(weightsNode);
-            res->blobs["weights"] = weights->blobs["custom"];
-            res->_weights = weights->blobs["custom"];
-        }
-        return res;
-    });
-
-    addSpecificCreator({"SpaceToBatch"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), node->description(),
-                             details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<SpaceToBatchLayer>(attrs);
-        res->params = params;
-        return res;
-    });
-
-    addSpecificCreator({"BatchToSpace"}, [](const std::shared_ptr<::ngraph::Node>& node,
-                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
-        LayerParams attrs = {node->get_friendly_name(), node->description(),
-                             details::ngraph::convertPrecision(node->get_output_element_type(0))};
-        auto res = std::make_shared<BatchToSpaceLayer>(attrs);
-        res->params = params;
-        return res;
-    });
-}
-
-CNNLayerPtr InferenceEngine::details::CNNLayerCreator::create() {
-    auto one_from = [](const std::string& desc, const std::vector<std::string>& descs) -> bool {
-        for (const auto& d : descs) {
-            if (details::CaselessEq<std::string>()(d, desc)) return true;
-        }
-        return false;
-    };
-    LayerParams attrs = {node->get_friendly_name(), node->description(),
-                         details::ngraph::convertPrecision(node->get_output_element_type(0))};
-    if (creators.find(node->description()) != creators.end())
-        return creators[node->description()](node, params);
-
-    auto res = std::make_shared<CNNLayer>(attrs);
-    res->params = params;
-    return res;
-}
@@ -102,7 +102,7 @@ public:
     std::shared_ptr<const ::ngraph::Function> getFunction() const noexcept override {
         return !cnnNetwork ? _ngraph_function : nullptr;
     }
-    std::shared_ptr<::ngraph::Function> getFunction() noexcept {
+    std::shared_ptr<::ngraph::Function> getFunction() noexcept override {
         return !cnnNetwork ? _ngraph_function : nullptr;
     }
 
@@ -118,9 +118,6 @@ public:
         noexcept override;
 
     void convertToCNNNetworkImpl();
-
-    std::shared_ptr<CNNNetworkNGraphImpl> cloneNGraphImpl() const;
-    void transformConstants();
 protected:
     std::shared_ptr<::ngraph::Function> _ngraph_function;
     virtual std::shared_ptr<::ngraph::Function> cloneFunction(bool constFolding = false, const std::map<std::string,
@@ -142,7 +139,7 @@ private:
 
     friend INFERENCE_ENGINE_API_CPP(std::shared_ptr<CNNNetworkImpl>)
     convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph,
-                                 const CNNNetworkNGraphImpl & nGraphImpl);
+                                 const ICNNNetwork& nGraphImpl);
 
     /**
      * @brief Reshape on the same shape
@@ -196,65 +193,6 @@ private:
 
 IE_SUPPRESS_DEPRECATED_END
 
-/**
- * @brief Creator for CNNLayer from nGraph op
- */
-class CNNLayerCreator : public ::ngraph::AttributeVisitor {
-public:
-    using CreatorFor = std::function<CNNLayerPtr(const std::shared_ptr<::ngraph::Node>& node,
-                                                 const std::map<std::string, std::string> param)>;
-    explicit CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node);
-
-    CNNLayerPtr create();
-
-    void on_attribute(const std::string& name, std::string& value) override {
-        params[name] = value;
-    }
-
-    void on_attribute(const std::string& name, bool& value) override {
-        params[name] = value ? "true" : "false";
-    }
-
-    void addSpecificCreator(const std::vector<std::string>& forTypes, const CreatorFor& creator) {
-        for (const auto type : forTypes) {
-            creators[type] = creator;
-        }
-    }
-
-    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::string>& adapter) override {
-        std::string data = adapter.get();
-        std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) {
-            return std::tolower(c);
-        });
-        params[name] = data;
-    }
-
-    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::vector<int64_t>>& adapter) override {
-        std::string dims;
-        auto shape = adapter.get();
-        for (size_t i = 0; i < shape.size(); i++) {
-            if (!dims.empty()) dims += ",";
-            dims += std::to_string(shape[i]);
-        }
-        params[name] = dims;
-    }
-
-    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<double>& adapter) override {
-        params[name] = std::to_string(adapter.get());
-    }
-
-    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<int64_t>& adapter) override {
-        params[name] = std::to_string(adapter.get());
-    }
-
-    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<void>& adapter) override;
-
-private:
-    std::shared_ptr<::ngraph::Node> node;
-    std::map<std::string, std::string> params;
-    std::map<std::string, CreatorFor> creators;
-};
-
 typedef std::shared_ptr<CNNNetworkNGraphImpl> CNNNetworkNGraphImplPtr;
 }  // namespace details
 }  // namespace InferenceEngine
index a28f5d8..197a096 100644 (file)
@@ -96,7 +96,7 @@ void ngraph::op::GenericIE::validate_and_infer_types() {
                 // Set dynamic output shapes if input shapes are not defined
                 for (size_t i = 0; i < outputs.size(); i++) {
                     const auto& port = outputs[i];
-                    auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+                    auto type = InferenceEngine::details::convertPrecision(port.precision);
                     set_output_type(i, type, PartialShape::dynamic());
                 }
                 return;
@@ -105,7 +105,7 @@ void ngraph::op::GenericIE::validate_and_infer_types() {
             Shape this_ishape = get_input_shape(i);
             InferenceEngine::SizeVector dims = this_ishape;
             InferenceEngine::Blob::Ptr input = make_blob_with_precision(InferenceEngine::TensorDesc(
-                InferenceEngine::details::ngraph::convertPrecision(get_input_element_type(i)), dims,
+                InferenceEngine::details::convertPrecision(get_input_element_type(i)), dims,
                 InferenceEngine::TensorDesc::getLayoutByDims(dims)));
             inputs.emplace_back(input);
         }
@@ -126,6 +126,11 @@ void ngraph::op::GenericIE::validate_and_infer_types() {
             }
         }
 
+        // WA: Proposal shape infer has to know number of outputs
+        if (type == "Proposal" && parameters.find("num_outputs") == parameters.end()) {
+            parameters["num_outputs"] = std::to_string(outputs.size());
+        }
+
         ret = impl->inferShapes(inputs, parameters, blobs, outShapes, nullptr);
         IE_SUPPRESS_DEPRECATED_END
 
@@ -134,7 +139,7 @@ void ngraph::op::GenericIE::validate_and_infer_types() {
         for (size_t i = 0; i < outputs.size(); i++) {
             const auto& port = outputs[i];
             ngraph::Shape outShape(outShapes[i]);
-            auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+            auto type = InferenceEngine::details::convertPrecision(port.precision);
             set_output_type(i, type, PartialShape(outShape));
         }
 
@@ -149,7 +154,7 @@ void ngraph::op::GenericIE::validate_and_infer_types() {
         for (size_t i = 0; i < outputs.size(); i++) {
             const auto& port = outputs[i];
             ngraph::Shape outShape(port.dims);
-            auto type = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+            auto type = InferenceEngine::details::convertPrecision(port.precision);
             set_output_type(i, type, PartialShape(outShape));
         }
         initialized++;
index df41f8c..1c7c73c 100644 (file)
 #include <vector>
 
 #include <ngraph/opsets/opset.hpp>
+#include "cpp/ie_cnn_net_reader.h"
 #include "cpp_interfaces/base/ie_plugin_base.hpp"
 #include "details/ie_exception_conversion.hpp"
 #include "details/ie_so_pointer.hpp"
 #include "file_utils.h"
-#include "ie_cnn_net_reader_impl.h"
 #include "ie_icore.hpp"
-#include "ie_ir_reader.hpp"
 #include "ie_plugin.hpp"
 #include "ie_plugin_config.hpp"
 #include "ie_profiling.hpp"
@@ -38,6 +37,27 @@ IE_SUPPRESS_DEPRECATED_START
 
 namespace {
 
+std::once_flag flag;
+std::shared_ptr<InferenceEngine::details::SharedObjectLoader> cnnReaderLoader;
+
+std::shared_ptr<InferenceEngine::details::SharedObjectLoader>
+createCnnReaderLoader() {
+    std::call_once(flag, [&] () {
+        FileUtils::FilePath libraryName = FileUtils::toFilePath(std::string("inference_engine_ir_readers") + std::string(IE_BUILD_POSTFIX));
+        FileUtils::FilePath irReadersLibraryPath = FileUtils::makeSharedLibraryName(getInferenceEngineLibraryPath(), libraryName);
+
+        if (!FileUtils::fileExist(irReadersLibraryPath)) {
+            THROW_IE_EXCEPTION << "Please, make sure that Inference Engine IR readers library "
+                << FileUtils::fromFilePath(::FileUtils::makeSharedLibraryName({}, libraryName)) << " is in "
+                << getIELibraryPath();
+        }
+        cnnReaderLoader = std::shared_ptr<InferenceEngine::details::SharedObjectLoader>(
+            new InferenceEngine::details::SharedObjectLoader(irReadersLibraryPath.c_str()));
+    });
+
+    return cnnReaderLoader;
+}
+
 IInferencePluginAPI* getInferencePluginAPIInterface(IInferencePlugin* iplugin) {
     return dynamic_cast<IInferencePluginAPI*>(iplugin);
 }
@@ -52,6 +72,11 @@ IInferencePluginAPI* getInferencePluginAPIInterface(InferencePlugin plugin) {
 
 }  // namespace
 
+CNNNetReaderPtr CreateCNNNetReaderPtr() noexcept {
+    auto loader = createCnnReaderLoader();
+    return CNNNetReaderPtr(loader);
+}
+
 IE_SUPPRESS_DEPRECATED_END
 
 DeviceIDParser::DeviceIDParser(const std::string& deviceNameWithID) {
@@ -112,6 +137,7 @@ std::vector<std::string> DeviceIDParser::getMultiDevices(std::string devicesList
 }
 
 class Core::Impl : public ICore {
+    // Fields are ordered by deletion order
     ITaskExecutor::Ptr _taskExecutor = nullptr;
 
     IE_SUPPRESS_DEPRECATED_START
@@ -124,10 +150,11 @@ class Core::Impl : public ICore {
         std::vector<FileUtils::FilePath> listOfExtentions;
     };
 
-    std::map<std::string, PluginDescriptor> pluginRegistry;
     std::unordered_set<std::string> opsetNames;
     std::vector<IExtensionPtr> extensions;
 
+    std::map<std::string, PluginDescriptor> pluginRegistry;
+
 public:
     Impl();
     ~Impl() override;
@@ -395,12 +422,18 @@ std::map<std::string, Version> Core::GetVersions(const std::string& deviceName)
 
     {
         // for compatibility with samples / demo
-        if (deviceName.find("HETERO:") == 0) {
-            deviceNames = DeviceIDParser::getHeteroDevices(deviceName.substr(7));
+        if (deviceName.find("HETERO") == 0) {
+            auto pos = deviceName.find_first_of(":");
+            if (pos != std::string::npos) {
+                deviceNames = DeviceIDParser::getHeteroDevices(deviceName.substr(pos + 1));
+            }
             deviceNames.push_back("HETERO");
         } else if (deviceName.find("MULTI") == 0) {
+            auto pos = deviceName.find_first_of(":");
+            if (pos != std::string::npos) {
+                deviceNames = DeviceIDParser::getMultiDevices(deviceName.substr(pos + 1));
+            }
             deviceNames.push_back("MULTI");
-            deviceNames = DeviceIDParser::getMultiDevices(deviceName.substr(6));
         } else {
             deviceNames.push_back(deviceName);
         }
@@ -457,13 +490,12 @@ Parsed<T> parseDeviceNameIntoConfig(const std::string& deviceName, const std::ma
 CNNNetwork Core::ReadNetwork(const std::string& modelPath, const std::string& binPath) const {
     IE_PROFILING_AUTO_SCOPE(Core::ReadNetwork)
     IE_SUPPRESS_DEPRECATED_START
-    auto cnnReader = std::shared_ptr<ICNNNetReader>(CreateCNNNetReader());
     ResponseDesc desc;
+    CNNNetReaderPtr cnnReader(createCnnReaderLoader());
     StatusCode rt = cnnReader->ReadNetwork(modelPath.c_str(), &desc);
     if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
-    auto cnnNetReaderImpl = std::dynamic_pointer_cast<details::CNNNetReaderImpl>(cnnReader);
-    if (cnnNetReaderImpl && cnnReader->getVersion(&desc) >= 10) {
-        cnnNetReaderImpl->addExtensions(_impl->getExtensions());
+    if (cnnReader->getVersion(&desc) >= 10) {
+        cnnReader->addExtensions(_impl->getExtensions());
     }
     std::string bPath = binPath;
     if (bPath.empty()) {
@@ -491,13 +523,12 @@ CNNNetwork Core::ReadNetwork(const std::string& modelPath, const std::string& bi
 CNNNetwork Core::ReadNetwork(const std::string& model, const Blob::CPtr& weights) const {
     IE_PROFILING_AUTO_SCOPE(Core::ReadNetwork)
     IE_SUPPRESS_DEPRECATED_START
-    auto cnnReader = std::shared_ptr<ICNNNetReader>(CreateCNNNetReader());
     ResponseDesc desc;
+    CNNNetReaderPtr cnnReader(createCnnReaderLoader());
     StatusCode rt = cnnReader->ReadNetwork(model.data(), model.length(), &desc);
     if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
-    auto cnnNetReaderImpl = std::dynamic_pointer_cast<details::CNNNetReaderImpl>(cnnReader);
-    if (cnnNetReaderImpl && cnnReader->getVersion(&desc) >= 10) {
-        cnnNetReaderImpl->addExtensions(_impl->getExtensions());
+    if (cnnReader->getVersion(&desc) >= 10) {
+        cnnReader->addExtensions(_impl->getExtensions());
     }
     TBlob<uint8_t>::Ptr weights_ptr;
     if (weights) {
@@ -507,6 +538,7 @@ CNNNetwork Core::ReadNetwork(const std::string& model, const Blob::CPtr& weights
     rt = cnnReader->SetWeights(weights_ptr, &desc);
     if (rt != OK) THROW_IE_EXCEPTION << desc.msg;
     IE_SUPPRESS_DEPRECATED_END
+
     return CNNNetwork(cnnReader);
 }
 
@@ -694,11 +726,6 @@ void Core::SetConfig(const std::map<std::string, std::string>& config, const std
             THROW_IE_EXCEPTION << "SetConfig is supported only for HETERO itself (without devices). "
                                   "You can configure the devices with SetConfig before creating the HETERO on top.";
         }
-
-        if (config.find("TARGET_FALLBACK") != config.end()) {
-            THROW_IE_EXCEPTION << "Please, specify TARGET_FALLBACK to the LoadNetwork directly, "
-                                  "as you will need to pass the same TARGET_FALLBACK anyway.";
-        }
     }
 
     // MULTI case
@@ -707,11 +734,6 @@ void Core::SetConfig(const std::map<std::string, std::string>& config, const std
             THROW_IE_EXCEPTION << "SetConfig is supported only for MULTI itself (without devices). "
                                   "You can configure the devices with SetConfig before creating the MULTI on top.";
         }
-
-        if (config.find(MultiDeviceConfigParams::KEY_MULTI_DEVICE_PRIORITIES) != config.end()) {
-            THROW_IE_EXCEPTION << "Please, specify DEVICE_PRIORITIES to the LoadNetwork directly, "
-                                  "as you will need to pass the same DEVICE_PRIORITIES anyway.";
-        }
     }
 
     if (deviceName.empty()) {
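
GetVersions now accepts both the bare "HETERO"/"MULTI" device names and the colon-separated form with an explicit device list, and the MULTI branch no longer drops the "MULTI" entry it just added. A short sketch of the resulting behaviour (the device list is an example):

    #include <ie_core.hpp>
    #include <iostream>

    int main() {
        InferenceEngine::Core core;

        // "HETERO:GNA,CPU" is split into GNA, CPU and HETERO itself;
        // a plain "HETERO" now also works and reports only the HETERO plugin version.
        for (const auto& entry : core.GetVersions("HETERO:GNA,CPU")) {
            std::cout << entry.first << " : " << entry.second.description << std::endl;
        }
        return 0;
    }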
index dded13a..58c893c 100644 (file)
@@ -13,6 +13,7 @@
 #include <ie_parameter.hpp>
 #include <ie_iextension.h>
 #include <ie_extension.h>
+
 #include <ngraph/opsets/opset.hpp>
 
 using namespace InferenceEngine;
@@ -83,6 +84,7 @@ template struct InferenceEngine::Parameter::RealData<std::vector<std::string>>;
 template struct InferenceEngine::Parameter::RealData<std::vector<unsigned long>>;
 template struct InferenceEngine::Parameter::RealData<std::tuple<unsigned int, unsigned int>>;
 template struct InferenceEngine::Parameter::RealData<std::tuple<unsigned int, unsigned int, unsigned int>>;
+template struct InferenceEngine::Parameter::RealData<InferenceEngine::Blob::Ptr>;
 #endif  // __clang__
 //
 // ie_blob.h
index f7aed75..337fbfa 100644 (file)
@@ -71,6 +71,13 @@ bool with_cpu_x86_avx512_core() {
 #endif
 }
 
+bool with_cpu_x86_bfloat16() {
+#ifdef ENABLE_MKL_DNN
+    return cpu.has(Xbyak::util::Cpu::tAVX512_BF16);
+#else
+    return false;
+#endif
+}
 
 bool checkOpenMpEnvVars(bool includeOMPNumThreads) {
     for (auto&& var : {
index 7a4c18d..f46ccbf 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 99d6a19..7543621 100644 (file)
@@ -12,6 +12,7 @@
 namespace InferenceEngine {
 
 ITaskExecutor::Ptr ExecutorManagerImpl::getExecutor(std::string id) {
+    std::lock_guard<std::mutex> guard(taskExecutorMutex);
     auto foundEntry = executors.find(id);
     if (foundEntry == executors.end()) {
         auto newExec = std::make_shared<CPUStreamsExecutor>(IStreamsExecutor::Config{id});
@@ -22,6 +23,7 @@ ITaskExecutor::Ptr ExecutorManagerImpl::getExecutor(std::string id) {
 }
 
 IStreamsExecutor::Ptr ExecutorManagerImpl::getIdleCPUStreamsExecutor(const IStreamsExecutor::Config& config) {
+    std::lock_guard<std::mutex> guard(streamExecutorMutex);
     for (const auto& it : cpuStreamsExecutors) {
         const auto& executor = it.second;
         if (executor.use_count() != 1)
@@ -52,6 +54,8 @@ size_t ExecutorManagerImpl::getIdleCPUStreamsExecutorsNumber() {
 }
 
 void ExecutorManagerImpl::clear(const std::string& id) {
+    std::lock_guard<std::mutex> stream_guard(streamExecutorMutex);
+    std::lock_guard<std::mutex> task_guard(taskExecutorMutex);
     if (id.empty()) {
         executors.clear();
         cpuStreamsExecutors.clear();
@@ -66,8 +70,47 @@ void ExecutorManagerImpl::clear(const std::string& id) {
     }
 }
 
+std::mutex ExecutorManager::_mutex;
 ExecutorManager* ExecutorManager::_instance = nullptr;
 
+ExecutorManager* ExecutorManager::getInstance() {
+    /*
+     * 1) We do not implement the singleton via a STATIC LOCAL object like
+     *
+     *   getInstance() {
+     *       static ExecutorManager _instance;
+     *       return &_instance;
+     *   }
+     *
+     * because of the destruction order on program exit.
+     * Some IE classes, e.g. MKLDNN::Engine, use this singleton in their destructors.
+     * They have no direct dependency from the C++ runtime's point of view, so the
+     * _instance local static variable may be destroyed before MKLDNN::~Engine is
+     * called. Any further manipulation with the destroyed object leads to an
+     * exception or a crash.
+     *
+     * 2) We do not implement the singleton via a STATIC object like:
+     *
+     *   ExecutorManager ExecutorManager::_instance;
+     *   getInstance() {
+     *       return &_instance;
+     *   }
+     *
+     * because of double destruction. Some test cases link the IE module twice,
+     * via its static and dynamic versions. Both modules then carry a static object
+     * with the same export name, which leads to double construction and double
+     * destruction of that object. With some C++ compilers (e.g. gcc 5.4) this
+     * crashes with a "double free".
+     *
+     * That is why the singleton instance is allocated manually on the heap.
+     */
+    std::lock_guard<std::mutex> guard(_mutex);
+    if (_instance == nullptr) {
+        _instance = new ExecutorManager();
+    }
+    return _instance;
+}
+
 ITaskExecutor::Ptr ExecutorManager::getExecutor(std::string id) {
     return _impl.getExecutor(id);
 }
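
With _mutex guarding getInstance and the per-container mutexes guarding the executor maps, executors can be requested from several threads without external locking. A minimal sketch, assuming the plugin-side header path threading/ie_executor_manager.hpp:

    #include <threading/ie_executor_manager.hpp>
    #include <thread>

    int main() {
        auto worker = [] {
            // Lazily creates or reuses the "CPU" executor; the lookup is now mutex-protected.
            auto executor = InferenceEngine::ExecutorManager::getInstance()->getExecutor("CPU");
            executor->run([] { /* some task */ });
        };
        std::thread t1(worker), t2(worker);
        t1.join();
        t2.join();
        return 0;
    }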
index 61104c0..0e04154 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/src/ir_readers/CMakeLists.txt b/inference-engine/src/ir_readers/CMakeLists.txt
new file mode 100644 (file)
index 0000000..f90b2ae
--- /dev/null
@@ -0,0 +1,49 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set(TARGET_NAME "inference_engine_ir_readers")
+
+if(ENABLE_LTO)
+    ie_enable_lto()
+endif()
+
+set(PUBLIC_HEADERS_DIR "${CMAKE_CURRENT_SOURCE_DIR}/")
+
+file(GLOB_RECURSE LIBRARY_SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
+file(GLOB_RECURSE PUBLIC_HEADERS ${PUBLIC_HEADERS_DIR}/*.h ${PUBLIC_HEADERS_DIR}/*.hpp)
+
+# Create named folders for the sources within the .vcproj
+# Empty name lists them directly under the .vcproj
+
+source_group("src" FILES ${LIBRARY_SRC})
+source_group("include" FILES ${PUBLIC_HEADERS})
+
+# Create shared library
+
+add_library(${TARGET_NAME} SHARED ${LIBRARY_SRC} ${PUBLIC_HEADERS})
+
+target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_API
+                                                  IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
+
+target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR})
+target_include_directories(${TARGET_NAME} PRIVATE "${IE_MAIN_SOURCE_DIR}/src/inference_engine")
+
+target_link_libraries(${TARGET_NAME} PUBLIC inference_engine_plugin_api ${NGRAPH_LIBRARIES} inference_engine)
+target_link_libraries(${TARGET_NAME} PRIVATE pugixml)
+
+# code style
+
+add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
+add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
+
+# developer package
+
+ie_developer_export_targets(${TARGET_NAME})
+
+# install
+
+install(TARGETS ${TARGET_NAME}
+        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
@@ -18,6 +18,7 @@
 #include "ie_format_parser.h"
 #include "ie_ir_reader.hpp"
 #include "ie_profiling.hpp"
+#include "ie_plugin.hpp"
 #include "parsers.h"
 #include "xml_parse_utils.h"
 
@@ -29,21 +30,19 @@ IE_SUPPRESS_DEPRECATED_START
 CNNNetReaderImpl::CNNNetReaderImpl(const FormatParserCreator::Ptr& _creator)
     : parseSuccess(false), _version(0), parserCreator(_creator) {}
 
+CNNNetReaderImpl::~CNNNetReaderImpl() { }
+
 StatusCode CNNNetReaderImpl::SetWeights(const TBlob<uint8_t>::Ptr& weights, ResponseDesc* desc) noexcept {
     if (!_parser && _version < 10) {
         return DescriptionBuffer(desc) << "network must be read first";
     }
     try {
         if (_version == 10) {
-#if defined(ENABLE_IR_READER)
             // It's time to perform actual reading of V10 network and instantiate CNNNetworkNGraphImpl
             IRReader v10Reader(extensions);
             std::stringstream model;
             xmlDoc->save(model);
             network = std::make_shared<CNNNetworkNGraphImpl>(v10Reader.read(model.str(), weights));
-#else
-            return DescriptionBuffer(desc) << "Please, recompile Inference Engine with the ENABLE_IR_READER=ON Cmake option";
-#endif
         } else {
             _parser->SetWeights(weights);
         }
@@ -173,15 +172,13 @@ void CNNNetReaderImpl::addExtensions(const std::vector<InferenceEngine::IExtensi
 }
 
 std::shared_ptr<IFormatParser> V2FormatParserCreator::create(size_t version) {
-#ifdef ENABLE_IR_READER
     return std::make_shared<FormatParser>(version);
-#else
-    THROW_IE_EXCEPTION << "Please, recompile Inference Engine library with the ENABLE_IR_READER=ON Cmake option";
-    return nullptr;
-#endif
 }
 
-InferenceEngine::ICNNNetReader* InferenceEngine::CreateCNNNetReader() noexcept {
-    return new CNNNetReaderImpl(std::make_shared<V2FormatParserCreator>());
+INFERENCE_PLUGIN_API(InferenceEngine::StatusCode)
+CreateICNNNetReader(ICNNNetReader *& data, ResponseDesc *resp) noexcept {
+    data = new CNNNetReaderImpl(std::make_shared<V2FormatParserCreator>());
+    return StatusCode::OK;
 }
+
 IE_SUPPRESS_DEPRECATED_END
 #include <vector>
 
 #include "cnn_network_impl.hpp"
-#include "ie_icnn_net_reader.h"
 #include "ie_memcpy.h"
 #include "ie_profiling.hpp"
 #include "parsers.h"
+#include "ie_util_internal.hpp"
 
 namespace pugi {
 class xml_node;
@@ -31,14 +31,14 @@ struct FormatParserCreator {
     virtual ~FormatParserCreator() = default;
 };
 
-struct V2FormatParserCreator : public FormatParserCreator {
+struct INFERENCE_ENGINE_API_CLASS(V2FormatParserCreator) : public FormatParserCreator {
     std::shared_ptr<IFormatParser> create(size_t version) override;
 };
 
 IE_SUPPRESS_DEPRECATED_START
-class CNNNetReaderImpl : public ICNNNetReader {
+class INFERENCE_ENGINE_API_CLASS(CNNNetReaderImpl) : public ICNNNetReader {
 public:
-    explicit CNNNetReaderImpl(const FormatParserCreator::Ptr& _parserCreator);
+    explicit CNNNetReaderImpl(const FormatParserCreator::Ptr& _creator);
 
     StatusCode ReadNetwork(const char* filepath, ResponseDesc* resp) noexcept override;
 
@@ -78,7 +78,9 @@ public:
         delete this;
     }
 
-    void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext);
+    void addExtensions(const std::vector<InferenceEngine::IExtensionPtr>& ext) override;
+
+    ~CNNNetReaderImpl() override;
 
 private:
     std::shared_ptr<InferenceEngine::details::IFormatParser> _parser;
@@ -96,6 +98,7 @@ private:
     std::shared_ptr<pugi::xml_document> xmlDoc;
     std::vector<InferenceEngine::IExtensionPtr> extensions;
 };
+
 IE_SUPPRESS_DEPRECATED_END
 
 }  // namespace details
@@ -267,9 +267,10 @@ FormatParser::FormatParser(size_t version): _version(version) {
                 std::make_shared<LayerCreator<TopKLayer>>("TopK"),
                 std::make_shared<LayerCreator<UniqueLayer>>("Unique"),
                 std::make_shared<LayerCreator<NonMaxSuppressionLayer>>("NonMaxSuppression"),
-                std::make_shared<LayerCreator<ScatterLayer>>("ScatterUpdate"),
+                std::make_shared<LayerCreator<ScatterUpdateLayer>>("ScatterUpdate"),
                 std::make_shared<LayerCreator<ExperimentalDetectronPriorGridGeneratorLayer>>("ExperimentalDetectronPriorGridGenerator"),
-                std::make_shared<LayerCreator<ExperimentalDetectronGenerateProposalsSingleImageLayer>>("ExperimentalDetectronGenerateProposalsSingleImage")};
+                std::make_shared<LayerCreator<ExperimentalDetectronGenerateProposalsSingleImageLayer>>("ExperimentalDetectronGenerateProposalsSingleImage"),
+                std::make_shared<LayerCreator<ExperimentalDetectronTopKROIs>>("ExperimentalDetectronTopKROIs")};
     creators.emplace_back(_version < 6 ? std::make_shared<LayerCreator<QuantizeLayer>>("Quantize")
                                        : std::make_shared<LayerCreator<QuantizeLayer>>("FakeQuantize"));
 }
@@ -71,11 +71,7 @@ public:
     }
 };
 
-#ifdef ENABLE_IR_READER
 class INFERENCE_ENGINE_API_CLASS(FormatParser): public IFormatParser {
-#else
-class FormatParser : public IFormatParser {
-#endif
 public:
     explicit FormatParser(size_t version);
 
@@ -187,7 +187,7 @@ V10Parser::GenericLayerParams V10Parser::parseGenericParams(const pugi::xml_node
         // Input port hasn't precision
         if (!input) {
             const std::string& preStr = GetStrAttr(parentNode, "precision");
-            type = InferenceEngine::details::ngraph::convertPrecision(preStr);
+            type = InferenceEngine::details::convertPrecision(preStr);
         }
         port.precision = type;
         return port;
@@ -413,7 +413,7 @@ std::shared_ptr<ngraph::Node> V10Parser::createNode(const std::vector<ngraph::Ou
         for (const auto& port : params.outputPorts) {
             ngraph::op::GenericIE::PortIE iePort;
             iePort.dims = port.dims;
-            iePort.precision = InferenceEngine::details::ngraph::convertPrecision(port.precision);
+            iePort.precision = InferenceEngine::details::convertPrecision(port.precision);
             outputs.emplace_back(iePort);
         }
 
@@ -767,7 +767,7 @@ std::shared_ptr<ngraph::Node> V10Parser::LayerCreator<ngraph::op::Convert>::crea
         THROW_IE_EXCEPTION << "Cannot read parameter for " << getType() << " layer with name: " << layerParsePrms.name;
 
     return std::make_shared<ngraph::op::Convert>(inputs[0],
-                                                 details::ngraph::convertPrecision(GetStrAttr(dn, "destination_type")));
+                                                 details::convertPrecision(GetStrAttr(dn, "destination_type")));
 }
 
 // LSTMCell layer
@@ -192,7 +192,7 @@ private:
             std::string val;
             if (!getStrAttribute(node.child("data"), name, val)) return;
             if (auto a = ngraph::as_type<ngraph::AttributeAdapter<ngraph::element::Type>>(&adapter)) {
-                static_cast<ngraph::element::Type&>(*a) = details::ngraph::convertPrecision(val);
+                static_cast<ngraph::element::Type&>(*a) = details::convertPrecision(val);
             } else if (auto a = ngraph::as_type<ngraph::AttributeAdapter<ngraph::PartialShape>>(&adapter)) {
                 std::vector<int64_t> shape;
                 std::vector<ngraph::Dimension> dims;
@@ -36,11 +36,7 @@ namespace InferenceEngine {
  * All methods here do not throw exceptions and return a StatusCode and ResponseDesc object.
  * Alternatively, to use methods that throw exceptions, refer to the CNNNetReader wrapper class.
  */
-#ifdef ENABLE_IR_READER
 class INFERENCE_ENGINE_API_CLASS(IRReader) {
-#else
-class IRReader {
-#endif
 public:
     IRReader() = default;
     explicit IRReader(const std::vector<IExtensionPtr>& exts): extensions(exts) {}
index a03a5f2..daae95a 100644 (file)
@@ -32,6 +32,8 @@ set_ie_threading_interface_for(${TARGET_NAME}_obj)
 target_compile_definitions(${TARGET_NAME}_obj PRIVATE IMPLEMENT_INFERENCE_ENGINE_API)
 
 target_include_directories(${TARGET_NAME}_obj PRIVATE ${PUBLIC_HEADERS_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src
+    ${IE_MAIN_SOURCE_DIR}/src/inference_engine                                          # For CNNNetworkNGraphImpl
+    $<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
     $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>
     $<TARGET_PROPERTY:ngraph::ngraph,INTERFACE_INCLUDE_DIRECTORIES>
     $<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>)
@@ -51,10 +53,9 @@ add_library(${TARGET_NAME} SHARED
 
 set_ie_threading_interface_for(${TARGET_NAME})
 
-target_link_libraries(${TARGET_NAME} PRIVATE ${NGRAPH_LIBRARIES} pugixml)
+target_link_libraries(${TARGET_NAME} PRIVATE ${NGRAPH_LIBRARIES} inference_engine_transformations pugixml)
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
 
 # export targets
 
@@ -67,6 +68,6 @@ ie_developer_export_targets(${TARGET_NAME})
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
index 617df27..fe3bd5e 100644 (file)
@@ -40,6 +40,10 @@ public:
         precision = prec;
     }
 
+    std::shared_ptr<::ngraph::Function> getFunction() noexcept override {
+        return nullptr;
+    }
+
     std::shared_ptr<const ::ngraph::Function> getFunction() const noexcept override {
         return nullptr;
     }
@@ -4,13 +4,18 @@
 
 #pragma once
 
-#include "cnn_network_ngraph_impl.hpp"
+#include "cnn_network_impl.hpp"
+#include <ngraph/attribute_visitor.hpp>
 
 #include <memory>
+#include <string>
+#include <vector>
 
 namespace InferenceEngine {
 namespace details {
+
 INFERENCE_ENGINE_API_CPP(std::shared_ptr<CNNNetworkImpl>)
-convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const CNNNetworkNGraphImpl &nGraphImpl);
+convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const ICNNNetwork &network);
+
 }  // namespace details
 }  // namespace InferenceEngine
index c96aa11..7de32ff 100644 (file)
@@ -24,6 +24,7 @@ namespace InferenceEngine {
  */
 class INFERENCE_ENGINE_API_CLASS(ConstTransformer) {
 public:
+    explicit ConstTransformer(ICNNNetwork* _network);
     explicit ConstTransformer(details::CNNNetworkImpl* _network);
     explicit ConstTransformer(std::vector<DataPtr> &_inputs, std::vector<DataPtr> &_outputs);
 
@@ -11,7 +11,6 @@
 
 namespace InferenceEngine {
 namespace details {
-namespace ngraph {
 
 inline ::ngraph::element::Type convertPrecision(const Precision& precision) {
     Precision::ePrecision pType = precision;
@@ -22,6 +21,8 @@ inline ::ngraph::element::Type convertPrecision(const Precision& precision) {
         return ::ngraph::element::Type(::ngraph::element::Type_t::f32);
     case Precision::FP16:
         return ::ngraph::element::Type(::ngraph::element::Type_t::f16);
+    case Precision::BF16:
+        return ::ngraph::element::Type(::ngraph::element::Type_t::bf16);
     case Precision::U8:
         return ::ngraph::element::Type(::ngraph::element::Type_t::u8);
     case Precision::I8:
@@ -53,6 +54,8 @@ inline ::ngraph::element::Type convertPrecision(const std::string& precision) {
         return ::ngraph::element::Type(::ngraph::element::Type_t::f16);
     } else if (precision == "f32" || precision == "FP32") {
         return ::ngraph::element::Type(::ngraph::element::Type_t::f32);
+    } else if (precision == "bf16" || precision == "BF16") {
+        return ::ngraph::element::Type(::ngraph::element::Type_t::bf16);
     } else if (precision == "f64" || precision == "FP64") {
         return ::ngraph::element::Type(::ngraph::element::Type_t::f64);
     } else if (precision == "i8" || precision == "I8") {
@@ -90,6 +93,8 @@ inline Precision convertPrecision(const ::ngraph::element::Type& precision) {
         return Precision(Precision::FP16);
     case ::ngraph::element::Type_t::f32:
         return Precision(Precision::FP32);
+    case ::ngraph::element::Type_t::bf16:
+        return Precision(Precision::BF16);
     case ::ngraph::element::Type_t::i8:
         return Precision(Precision::I8);
     case ::ngraph::element::Type_t::i16:
@@ -113,6 +118,5 @@ inline Precision convertPrecision(const ::ngraph::element::Type& precision) {
     }
 }
 
-}  // namespace ngraph
 }  // namespace details
 }  // namespace InferenceEngine
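
The BF16 branches added above extend all three convertPrecision overloads consistently. A minimal round-trip check, sketched under the assumption that the helpers live in InferenceEngine::details and that the header name used here is only illustrative:

    #include <cassert>
    #include <ie_precision.hpp>
    #include <ngraph/type/element_type.hpp>
    #include "ie_ngraph_utils.hpp"  // assumed name for the header that defines convertPrecision

    void check_bf16_round_trip() {
        using InferenceEngine::Precision;
        namespace details = InferenceEngine::details;

        // IE precision -> nGraph element type
        const ::ngraph::element::Type bf16 = details::convertPrecision(Precision(Precision::BF16));
        assert(bf16 == ::ngraph::element::Type(::ngraph::element::Type_t::bf16));

        // Both string spellings resolve to the same element type
        assert(details::convertPrecision("bf16") == bf16);
        assert(details::convertPrecision("BF16") == bf16);

        // nGraph element type -> IE precision
        assert(details::convertPrecision(bf16) == Precision(Precision::BF16));
    }
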
index 8eaf449..c22e41a 100644 (file)
@@ -6,6 +6,7 @@
 
 #include <cpp/ie_cnn_network.h>
 
+#include <ie_icnn_network.hpp>
 #include <cnn_network_impl.hpp>
 #include <file_utils.h>
 #include <deque>
@@ -50,6 +51,15 @@ cloneNet(const std::vector<InferenceEngine::CNNLayerPtr>& layers, const ICNNNetw
 IE_SUPPRESS_DEPRECATED_END
 
 /**
+ * @brief Clones the whole network without conversion to CNNNetworkImpl. All layers and data objects will be cloned
+ * @note Blobs inside layers are reused
+ * @param network A network to clone
+ * @return A cloned object
+ */
+INFERENCE_ENGINE_API_CPP(std::shared_ptr<InferenceEngine::ICNNNetwork>)
+cloneNetwork(const InferenceEngine::ICNNNetwork& network);
+
+/**
  * @brief Clones the whole network. All layers and data objects will be cloned
  * @note Blobs inside layers are reused
  * @param network A network to clone
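
For reference, a minimal sketch of calling the new cloneNetwork() declared above, assuming it sits in namespace InferenceEngine like the neighbouring cloneNet helpers; the header name is an assumption:

    #include <memory>
    #include <ie_icnn_network.hpp>
    #include "ie_util_internal.hpp"  // assumed location of the cloneNetwork() declaration

    std::shared_ptr<InferenceEngine::ICNNNetwork> duplicate(const InferenceEngine::ICNNNetwork& network) {
        // No conversion to CNNNetworkImpl happens here; blobs inside layers are reused,
        // as the @note above states.
        return InferenceEngine::cloneNetwork(network);
    }
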
index 3759bb3..cd2e6c3 100644 (file)
@@ -33,8 +33,8 @@ using AllLayers =
                ReshapeLayer*, TileLayer*, ScaleShiftLayer*, PReLULayer*, PowerLayer*, BatchNormalizationLayer*,
                ClampLayer*, TensorIterator*, LSTMCell*, GRUCell*, RNNCell*, RNNSequenceLayer*, QuantizeLayer*,
                BinaryConvolutionLayer*, WeightableLayer*, OneHotLayer*, MathLayer*, ReduceLayer*, UniqueLayer*,
-               NonMaxSuppressionLayer*, ScatterLayer*, ExperimentalDetectronPriorGridGeneratorLayer*,
-               ExperimentalDetectronGenerateProposalsSingleImageLayer*, CNNLayer*>;
+               NonMaxSuppressionLayer*, ScatterUpdateLayer*, ExperimentalDetectronPriorGridGeneratorLayer*,
+               ExperimentalDetectronGenerateProposalsSingleImageLayer*, ExperimentalDetectronTopKROIs*, CNNLayer*>;
 
 template <class Visitor, std::size_t I = 0, typename... Tp>
 inline typename std::enable_if<I == sizeof...(Tp), void>::type visitActualLayer(std::tuple<Tp...>&& t,
index f01b647..04830f3 100644 (file)
@@ -1000,7 +1000,9 @@ void CNNNetworkInt8Normalizer::QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr
                 }
                 prev = *it;
             }
-            symQuant = *(intervals.begin());
+            if (!intervals.empty()) {
+                symQuant = *(intervals.begin());
+            }
             std::set<double> divs;
             prev = 0.f;
             for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
@@ -9,6 +9,7 @@
 #include <vector>
 #include <unordered_set>
 
+#include <cnn_network_ngraph_impl.hpp>
 #include "ngraph_ops/convolution_ie.hpp"
 #include "ngraph_ops/deconvolution_ie.hpp"
 #include "ngraph_ops/eltwise.hpp"
 #include "ie_profiling.hpp"
 #include "ie_cnn_layer_builder_ngraph.h"
 
+#include <debug.h>
 #include "transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp"
 #include "transformations/utils/utils.hpp"
 
 namespace InferenceEngine {
 namespace details {
-std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const CNNNetworkNGraphImpl &nGraphImpl) {
+
+/**
+ * @brief Creator for CNNLayer from nGraph op
+ */
+class CNNLayerCreator : public ::ngraph::AttributeVisitor {
+public:
+    using CreatorFor = std::function<CNNLayerPtr(const std::shared_ptr<::ngraph::Node>& node,
+                                                 const std::map<std::string, std::string> param)>;
+    explicit CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node);
+
+    CNNLayerPtr create();
+
+    void on_attribute(const std::string& name, std::string& value) override {
+        params[name] = value;
+    }
+
+    void on_attribute(const std::string& name, bool& value) override {
+        params[name] = value ? "true" : "false";
+    }
+
+    void addSpecificCreator(const std::vector<std::string>& forTypes, const CreatorFor& creator) {
+        for (const auto& type : forTypes) {
+            creators[type] = creator;
+        }
+    }
+
+    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::string>& adapter) override {
+        std::string data = adapter.get();
+        std::transform(data.begin(), data.end(), data.begin(), [](unsigned char c) {
+            return std::tolower(c);
+        });
+        params[name] = data;
+    }
+
+    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<std::vector<int64_t>>& adapter) override {
+        auto shape = adapter.get();
+        params[name] = joinVec(shape);
+    }
+
+    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<double>& adapter) override {
+        params[name] = std::to_string(adapter.get());
+    }
+
+    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<int64_t>& adapter) override {
+        params[name] = std::to_string(adapter.get());
+    }
+
+    void on_adapter(const std::string& name, ::ngraph::ValueAccessor<void>& adapter) override;
+
+private:
+    std::shared_ptr<::ngraph::Node> node;
+    std::map<std::string, std::string> params;
+    std::map<std::string, CreatorFor> creators;
+};
+
+void InferenceEngine::details::CNNLayerCreator::on_adapter(const std::string& name,
+                                                           ::ngraph::ValueAccessor<void>& adapter) {
+    if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::element::Type>>(&adapter)) {
+        auto type = static_cast<::ngraph::element::Type&>(*a);
+        params[name] = details::convertPrecision(type).name();
+    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::PartialShape>>(&adapter)) {
+        std::string dims;
+        auto shape = static_cast<::ngraph::PartialShape&>(*a);
+        for (size_t i = 0; i < shape.rank().get_length(); i++) {
+            if (!dims.empty()) dims += ",";
+            dims += std::to_string(shape[i].get_length());
+        }
+        params[name] = dims;
+    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Shape>>(&adapter)) {
+        auto shape = static_cast<::ngraph::Shape&>(*a);
+        params[name] = joinVec(shape);
+    } else if (auto a = ::ngraph::as_type<::ngraph::AttributeAdapter<::ngraph::Strides>>(&adapter)) {
+        auto shape = static_cast<::ngraph::Strides&>(*a);
+        params[name] = joinVec(shape);
+    }
+}
+
+InferenceEngine::details::CNNLayerCreator::CNNLayerCreator(const std::shared_ptr<::ngraph::Node>& node): node(node) {
+    addSpecificCreator({"Parameter"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                         const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), "Input",
+            details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<CNNLayer>(attrs);
+        return res;
+    });
+    // TODO - Remove "GreaterEq" once ngraph transitions to GreaterEqual
+    addSpecificCreator({"Eltwise", "Subtract", "Power", "Maximum", "Divide", "Greater", "GreaterEqual", "FloorMod", "LogicalOr", "LogicalAnd", "LogicalXor",
+        "GreaterEq", "Less", "LessEqual", "Equal", "NotEqual", "Multiply", "Add"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                                                 const std::map<std::string, std::string> params) -> CNNLayerPtr {
+            LayerParams attrs = {node->get_friendly_name(), "Eltwise",
+                details::convertPrecision(node->get_output_element_type(0))};
+            auto res = std::make_shared<EltwiseLayer>(attrs);
+            res->params = params;
+            if (node->description() == "Maximum") {
+                res->params["operation"] = "max";
+            } else if (node->description() == "Power") {
+                res->params["operation"] = "pow";
+            } else if (node->description() == "Subtract") {
+                res->params["operation"] = "sub";
+            } else if (node->description() == "Divide") {
+                res->params["operation"] = "div";
+            } else if (node->description() == "LessEqual") {
+                res->params["operation"] = "less_equal";
+            } else if (node->description() == "Less") {
+                res->params["operation"] = "less";
+            } else if (node->description() == "Equal") {
+                res->params["operation"] = "equal";
+            } else if (node->description() == "NotEqual") {
+                res->params["operation"] = "not_equal";
+            } else if (node->description() == "FloorMod") {
+                res->params["operation"] = "floor_mod";
+            } else if (node->description() == "Multiply") {
+                res->params["operation"] = "prod";
+            } else if (node->description() == "Add") {
+                res->params["operation"] = "sum";
+            } else if (node->description() == "Greater") {
+                res->params["operation"] = "greater";
+            } else if (node->description() == "GreaterEq") {
+                res->params["operation"] = "greater_equal";
+            } else if (node->description() == "GreaterEqual") {
+                res->params["operation"] = "greater_equal";
+            } else if (node->description() == "LogicalOr") {
+                res->params["operation"] = "logical_or";
+            } else if (node->description() == "LogicalAnd") {
+                res->params["operation"] = "logical_and";
+            } else if (node->description() == "LogicalXor") {
+                res->params["operation"] = "logical_xor";
+            } else if (node->description() == "Eltwise") {
+                auto castedLayer = std::dynamic_pointer_cast<::ngraph::op::Eltwise>(node);
+                if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << attrs.type << " layer " << attrs.name;
+                std::string type;
+                switch (castedLayer->eltwise_type) {
+                case ELTWISE_TYPE::Sum:
+                    type = "sum";
+                    break;
+                case ELTWISE_TYPE::Prod:
+                    type = "prod";
+                    break;
+                default:
+                    THROW_IE_EXCEPTION << "Not supported eltwise type!";
+                }
+
+                res->params["operation"] = type;
+            }
+            return res;
+        });
+    addSpecificCreator({"Concat"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), node->description(),
+            details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<ConcatLayer>(attrs);
+        res->params = params;
+        return res;
+    });
+    addSpecificCreator({"AvgPool", "MaxPool"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                                  const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), "Pooling",
+            details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<PoolingLayer>(attrs);
+        res->params = params;
+        if (res->params.find("auto_pad") != res->params.end() &&
+            details::CaselessEq<std::string>()(res->params["auto_pad"], "EXPLICIT"))
+            res->params.erase("auto_pad");
+
+        if (res->params.find("exclude_pad") != res->params.end()) {
+            res->params["exclude-pad"] = res->params["exclude_pad"];
+            res->params.erase("exclude_pad");
+        }
+
+        if (node->description() == "MaxPool") {
+            res->params["pool-method"] = "max";
+        } else if (node->description() == "AvgPool") {
+            res->params["pool-method"] = "avg";
+        }
+        return res;
+    });
+    addSpecificCreator({"Select"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), node->description(),
+                             details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<SelectLayer>(attrs);
+        res->params = params;
+        return res;
+    });
+    addSpecificCreator({"BinaryConvolution"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), node->description(),
+                             details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<BinaryConvolutionLayer>(attrs);
+
+        // TODO: investigate the difference between the nGraph parameters for BinaryConvolution and the implementation above;
+        // it leads to an accuracy issue in the Precollected_ONNX_ResNet50_88percentinto1bit e2e test
+        // res->params = params;
+
+        auto castedLayer = ::ngraph::as_type_ptr<::ngraph::op::v1::BinaryConvolution>(node);
+        if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << attrs.type << " layer " << attrs.name;
+
+        std::string value;
+        for (const auto& val : castedLayer->get_pads_begin()) {
+            if (!value.empty()) value += ",";
+            value += Builder::asString(val);
+        }
+        res->params["pads_begin"] = value;
+
+        value.clear();
+        for (const auto& val : castedLayer->get_pads_end()) {
+            if (!value.empty()) value += ",";
+            value += Builder::asString(val);
+        }
+        res->params["pads_end"] = value;
+
+        switch (castedLayer->get_auto_pad()) {
+            case ::ngraph::op::PadType::SAME_UPPER:
+                res->params["auto_pad"] = "same_upper";
+                break;
+            case ::ngraph::op::PadType::SAME_LOWER:
+                res->params["auto_pad"] = "same_lower";
+                break;
+            case ::ngraph::op::PadType::VALID:
+                res->params["auto_pad"] = "valid";
+                break;
+            default:
+                break;
+        }
+
+        value.clear();
+        for (const auto& val : castedLayer->get_strides()) {
+            if (!value.empty()) value += ",";
+            value += Builder::asString(val);
+        }
+        res->params["strides"] = value;
+
+        value.clear();
+        for (const auto& val : castedLayer->get_dilations()) {
+            if (!value.empty()) value += ",";
+            value += Builder::asString(val);
+        }
+        res->params["dilations"] = value;
+
+        // Restore kernel size and output
+        const auto& shape = castedLayer->get_input_shape(1);
+        res->params["output"] = Builder::asString(shape[0]);
+
+        value.clear();
+        for (size_t i = 2; i < shape.size(); i++) {
+            if (!value.empty()) value += ",";
+            value += Builder::asString(shape[i]);
+        }
+        res->params["kernel"] = value;
+
+        switch (castedLayer->get_mode()) {
+            case ::ngraph::op::v1::BinaryConvolution::BinaryConvolutionMode::XNOR_POPCOUNT:
+                res->params["mode"] = "xnor-popcount";
+        }
+
+        auto weights_shape = castedLayer->input(1).get_source_output().get_shape();
+        res->params["input"] = Builder::asString(weights_shape[1]);
+        res->params["pad_value"] = Builder::asString(castedLayer->get_pad_value());
+
+        Builder::NodeConverter<::ngraph::op::Constant> converter;
+
+        const auto weightsNode = castedLayer->get_inputs()[1].get_output().get_node();
+        if (converter.canCreate(weightsNode)) {
+            const auto& weights = converter.createLayer(weightsNode);
+            res->blobs["weights"] = weights->blobs["custom"];
+            res->_weights = weights->blobs["custom"];
+        }
+        return res;
+    });
+
+    addSpecificCreator({"SpaceToBatch"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), node->description(),
+                             details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<SpaceToBatchLayer>(attrs);
+        res->params = params;
+        return res;
+    });
+
+    addSpecificCreator({"BatchToSpace"}, [](const std::shared_ptr<::ngraph::Node>& node,
+                                      const std::map<std::string, std::string> params) -> CNNLayerPtr {
+        LayerParams attrs = {node->get_friendly_name(), node->description(),
+                             details::convertPrecision(node->get_output_element_type(0))};
+        auto res = std::make_shared<BatchToSpaceLayer>(attrs);
+        res->params = params;
+        return res;
+    });
+}
+
+CNNLayerPtr InferenceEngine::details::CNNLayerCreator::create() {
+    auto one_from = [](const std::string& desc, const std::vector<std::string>& descs) -> bool {
+        for (const auto& d : descs) {
+            if (details::CaselessEq<std::string>()(d, desc)) return true;
+        }
+        return false;
+    };
+    LayerParams attrs = {node->get_friendly_name(), node->description(),
+                         details::convertPrecision(node->get_output_element_type(0))};
+    if (creators.find(node->description()) != creators.end())
+        return creators[node->description()](node, params);
+
+    auto res = std::make_shared<CNNLayer>(attrs);
+    res->params = params;
+    return res;
+}
+
+std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_ptr<const ::ngraph::Function>& graph, const ICNNNetwork &network) {
     IE_PROFILING_AUTO_SCOPE(convertFunctionToICNNNetwork)
     const auto createCNNLayer = [](const std::shared_ptr<::ngraph::Node> &node) -> CNNLayerPtr {
         class NGraphCNNLayer: public CNNLayer {
@@ -240,8 +547,10 @@ std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_p
         network->setInputInfo(info);
     };
 
+    const CNNNetworkNGraphImpl* nGraphImpl = dynamic_cast<const CNNNetworkNGraphImpl*>(&network);
+
     InputsDataMap thisInputDataMap;
-    nGraphImpl.getInputsInfo(thisInputDataMap);
+    network.getInputsInfo(thisInputDataMap);
 
     // Create network
     auto cnnNetworkImpl = std::make_shared<details::CNNNetworkImpl>();
@@ -295,25 +604,25 @@ std::shared_ptr<CNNNetworkImpl> convertFunctionToICNNNetwork(const std::shared_p
             for (const auto &dim : dims) {
                 if (!dim)
                     THROW_IE_EXCEPTION << cnnLayer->type << " layer " << cnnLayer->name
-                                       << " has incorrect dimensions in the output data " << i;
+                        << " has incorrect dimensions in the output data " << i;
             }
-
-            if (!ptr && nGraphImpl._data.find(outName) != nGraphImpl._data.end()) {
-                ptr = nGraphImpl._data.at(outName);
+            if (!ptr && nGraphImpl && nGraphImpl->_data.find(outName) != nGraphImpl->_data.end()) {
+                ptr = nGraphImpl->_data.at(outName);
                 if (auto nData = std::dynamic_pointer_cast<InferenceEngine::details::NGraphData>(ptr)) {
                     const auto layout =
-                            dims.size() == nData->getTensorDesc().getDims().size() ?
-                            nData->getTensorDesc().getLayout() :
-                            TensorDesc::getLayoutByDims(dims);
+                        dims.size() == nData->getTensorDesc().getDims().size() ?
+                        nData->getTensorDesc().getLayout() :
+                        TensorDesc::getLayoutByDims(dims);
 
                     nData->reset();
                     nData->reshape(dims, layout);
                 }
                 cnnNetworkImpl->addData(outName.c_str(), ptr);
             }
+
             if (!ptr) {
                 ptr.reset(new Data(outName,
-                                   {details::ngraph::convertPrecision(layer->get_output_element_type(i)), dims,
+                                   {details::convertPrecision(layer->get_output_element_type(i)), dims,
                                     TensorDesc::getLayoutByDims(dims)}));
             }
 
index 3b0be37..cb5afbf 100644 (file)
@@ -17,6 +17,7 @@
 #include <vector>
 #include <mutex>
 
+#include <cnn_network_ngraph_impl.hpp>
 #include "blob_factory.hpp"
 #include "cnn_network_impl.hpp"
 #include "graph_tools.hpp"
@@ -70,6 +71,19 @@ ConstTransformer::ConstTransformer(details::CNNNetworkImpl* _network)
         THROW_IE_EXCEPTION << "[ERROR]: Failed to init ConstTransformer with null pointer of network";
 }
 
+ConstTransformer::ConstTransformer(ICNNNetwork* _network) {
+    if (auto cnnNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(_network)) {
+        network = cnnNet;
+    } else if (auto nGraphNet = dynamic_cast<InferenceEngine::details::CNNNetworkNGraphImpl *>(_network)) {
+        if (auto cnnNet = dynamic_cast<InferenceEngine::details::CNNNetworkImpl *>(nGraphNet->getCNNNetwork().get()))
+            network = cnnNet;
+    }
+    if (!network)
+        THROW_IE_EXCEPTION << "[ERROR]: Failed to init ConstTransformer with unsupported network type";
+    inputs = get_inputs(network);
+    outputs = get_outputs(network);
+}
+
 ConstTransformer::ConstTransformer(std::vector<DataPtr> &_inputs, std::vector<DataPtr> &_outputs)
         : inputs(_inputs), outputs(_outputs), network(nullptr) {
     if (inputs.empty() || outputs.empty())
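
A short usage sketch of the new ICNNNetwork* constructor implemented above; the header name and the fullTrim() entry point are assumptions taken from the existing ConstTransformer interface:

    #include <ie_icnn_network.hpp>
    #include "graph_transformer.h"  // assumed header declaring ConstTransformer

    void foldConstantSubgraphs(InferenceEngine::ICNNNetwork* network) {
        // Accepts both CNNNetworkImpl- and CNNNetworkNGraphImpl-backed networks;
        // any other implementation makes the constructor throw, as shown above.
        InferenceEngine::ConstTransformer transformer(network);
        transformer.fullTrim();  // assumed existing method: folds and removes constant subgraphs
    }
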
@@ -3,7 +3,7 @@
 //
 
 #include <ie_cnn_layer_builder_ngraph.h>
-#include "cnn_network_ngraph_impl.hpp"
+#include <cnn_network_ngraph_impl.hpp>
 #include <precision_utils.h>
 #include <cpp/ie_cnn_network.h>
 
@@ -72,7 +72,7 @@ std::string asString<float>(const float& value) {
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Abs>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Abs",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -83,7 +83,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::GenericIE>::createLayer(const std::share
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get layer " << layer->get_friendly_name();
 
     LayerParams params = {layer->get_friendly_name(), castedLayer->getType(),
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     if (castedLayer->getType() == "RNNCell")
         res = std::make_shared<InferenceEngine::RNNCell>(params);
@@ -232,7 +232,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::TensorIterator>::createLayer(const std::
 
     // Create Inference Engine representation of TensorIterator
     LayerParams params = {layer->get_friendly_name(), "TensorIterator",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::TensorIterator>(params);
 
     // Body: inputs
@@ -365,7 +365,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::TensorIterator>::createLayer(const std::
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Constant>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Const",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Constant>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -377,9 +377,9 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Constant>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Convert>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Convert",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
-    auto p = details::ngraph::convertPrecision(layer->get_output_element_type(0));
+    auto p = details::convertPrecision(layer->get_output_element_type(0));
     std::string precision_str;
     switch (p) {
     case Precision::FP16:
@@ -423,7 +423,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Convert>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Ceiling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Ceiling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -431,7 +431,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Ceiling>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Floor>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Floor",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -439,7 +439,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Floor>::createLayer(const std::shared_pt
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Sigmoid>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Sigmoid",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -447,7 +447,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Sigmoid>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Tanh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "TanH",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -455,7 +455,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Tanh>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Relu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReLU",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReLULayer>(params);
     return res;
 }
@@ -463,7 +463,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Relu>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::SeluIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Selu",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::SeluIE>(layer);
@@ -477,7 +477,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::SeluIE>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ReLUIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReLU",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReLULayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReLUIE>(layer);
@@ -490,7 +490,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ReLUIE>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Range>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Range",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -498,7 +498,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Range>::createLayer(const std::shared_pt
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Exp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Exp",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -506,7 +506,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Exp>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::MVN>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "MVN",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::MVNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::MVN>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -529,7 +529,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::LRN>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::LRN_IE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Norm",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::NormLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::LRN_IE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -545,7 +545,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::LRN_IE>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::CropIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Crop",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CropLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::CropIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -577,7 +577,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::CropIE>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Clamp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Clamp",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ClampLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Clamp>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -590,7 +590,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Clamp>::createLayer(const std::shared_pt
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Softmax>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "SoftMax",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::SoftMaxLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Softmax>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -602,7 +602,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Softmax>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Subtract>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "sub";
     return res;
@@ -611,7 +611,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Subtract>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Power>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "pow";
     return res;
@@ -620,7 +620,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Power>::createLayer(const std::share
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Maximum>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "max";
     return res;
@@ -634,7 +634,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Minimum>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Divide>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "div";
     return res;
@@ -643,7 +643,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Divide>::createLayer(const std::shar
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Multiply>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "prod";
     return res;
@@ -652,7 +652,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Multiply>::createLayer(const std::sh
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Add>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "sum";
     return res;
@@ -673,7 +673,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::BatchNormInference>::createLayer(
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Squeeze>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Squeeze",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Squeeze>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -684,7 +684,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Squeeze>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Unsqueeze>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Unsqueeze",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Unsqueeze>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -695,7 +695,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Unsqueeze>::createLayer(const std::share
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::FakeQuantize>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "FakeQuantize",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::QuantizeLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::FakeQuantize>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -707,7 +707,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ConvolutionIE>::createLayer(
         const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Convolution",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ConvolutionLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ConvolutionIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -793,7 +793,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::DeconvolutionIE>::createLayer(
         const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Deconvolution",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::DeconvolutionLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::DeconvolutionIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -861,7 +861,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformableConvolution>::createLayer(
         const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "DeformableConvolution",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::DeformableConvolutionLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::DeformableConvolution>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -935,7 +935,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformableConvolution>::createLayer(
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::AvgPool>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Pooling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::PoolingLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::AvgPool>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1001,7 +1001,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::AvgPool>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::MaxPool>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Pooling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::PoolingLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::MaxPool>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1066,7 +1066,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::MaxPool>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ROIPooling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ROIPooling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ROIPooling>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1082,7 +1082,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ROIPooling>::createLayer(const std::shar
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PSROIPooling>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "PSROIPooling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PSROIPooling>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1101,7 +1101,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformablePSROIPooling>::createLayer(
         const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "PSROIPooling",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::DeformablePSROIPooling>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1125,7 +1125,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::DeformablePSROIPooling>::createLayer
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PRelu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "PReLU",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::PReLULayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PRelu>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1154,7 +1154,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::PRelu>::createLayer(const std::shared_pt
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Split>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Split",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::SplitLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Split>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1175,7 +1175,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Split>::createLayer(const std::share
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::VariadicSplit>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Split",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::SplitLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::VariadicSplit>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1196,7 +1196,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::VariadicSplit>::createLayer(const std::s
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Concat>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Concat",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ConcatLayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Concat>(layer);
@@ -1210,7 +1210,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Concat>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::GatherIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Gather",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::GatherLayer>(params);
 
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::GatherIE>(layer);
@@ -1229,14 +1229,14 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::GatherTree>::createLayer(const std::
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::GatherTreeIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "GatherTree",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ReverseSequence>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = {layer->get_friendly_name(), "ReverseSequence", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+    LayerParams params = {layer->get_friendly_name(), "ReverseSequence", details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReverseSequenceLayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReverseSequence>(layer);
@@ -1252,7 +1252,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ReverseSequence>::createLayer(const std:
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Reshape>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Reshape",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReshapeLayer>(params);
     return res;
 }
@@ -1260,7 +1260,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Reshape>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ShapeOf>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ShapeOf",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -1268,7 +1268,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ShapeOf>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::Reshape>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Reshape",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::Reshape>(layer);
     if (castedLayer == nullptr)
@@ -1293,7 +1293,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::Reshape>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PadIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Pad",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::PadLayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PadIE>(layer);
@@ -1333,7 +1333,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::PadIE>::createLayer(const std::shared_pt
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ScaleShiftIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ScaleShift",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ScaleShiftLayer>(params);
 
     NodeConverter<ngraph::op::Constant> converter;
@@ -1357,7 +1357,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ScaleShiftIE>::createLayer(const std::sh
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Elu>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "elu",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Elu>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1370,7 +1370,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Elu>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::SquaredDifference>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     res->params["operation"] = "squared_diff";
     return res;
@@ -1380,7 +1380,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::DetectionOutput>::createLayer(
     const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "DetectionOutput",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::DetectionOutput>(layer);
@@ -1416,7 +1416,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::DetectionOutput>::createLayer(
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Transpose>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Permute",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
 
     NodeConverter<ngraph::op::Constant> converter;
@@ -1444,7 +1444,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Proposal>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ProposalIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Proposal",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ProposalIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1484,7 +1484,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PriorBoxClusteredIE>::createLayer(
     const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "PriorBoxClustered",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PriorBoxClusteredIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1533,7 +1533,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::PriorBoxClustered>::createLayer(
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PriorBoxIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "PriorBox",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PriorBoxIE>(layer);
     auto layer_info = params.type + " layer " + params.name;
@@ -1613,7 +1613,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::PriorBox>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::PowerIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Power",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::PowerLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::PowerIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1628,7 +1628,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::PowerIE>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::TopK>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "TopK",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::TopKLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::TopK>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1672,7 +1672,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::TopK>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::TopKIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "TopK",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::TopKLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::TopKIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1687,7 +1687,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::TopKIE>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Eltwise>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Eltwise",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::EltwiseLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Eltwise>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1712,7 +1712,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Eltwise>::createLayer(const std::shared_
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::TileIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Tile",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::TileLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::TileIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1725,7 +1725,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::TileIE>::createLayer(const std::shared_p
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ResampleV2>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = {layer->get_friendly_name(), "Resample", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+    LayerParams params = {layer->get_friendly_name(), "Resample", details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ResampleV2>(layer);
     if (castedLayer == nullptr)
@@ -1752,7 +1752,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ResampleV2>::createLayer(const std::shar
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Interp>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Resample",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::Interp>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
 
@@ -1766,7 +1766,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Interp>::createLayer(const std::shared_p
     }
 
     params = {layer->get_friendly_name(), "Interp",
-              details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+              details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
 
     res->params["height"] = asString(attrs.height);
@@ -1786,7 +1786,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Interpolate>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::FullyConnected>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "FullyConnected",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::FullyConnected>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1824,7 +1824,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::LSTMCell>::createLayer(const std::shared
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::LSTMCellIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "LSTMCell",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::LSTMCellIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
 
@@ -1872,7 +1872,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::LSTMCellIE>::createLayer(const std::shar
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::MatMul>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Gemm",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
 
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::MatMul>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1887,7 +1887,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::MatMul>::createLayer(const std::shared_p
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::RegionYolo>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "RegionYolo",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::RegionYolo>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1920,7 +1920,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::RegionYolo>::createLayer(const std::shar
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::ReorgYolo>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReorgYolo",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::ReorgYolo>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1938,7 +1938,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::ReorgYolo>::createLayer(const std::share
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReduceMin",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMin>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1950,7 +1950,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMin>::createLayer(const std::s
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMax>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReduceMax",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMax>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1962,7 +1962,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMax>::createLayer(const std::s
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMean>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReduceMean",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceMean>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1974,7 +1974,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceMean>::createLayer(const std::
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceProd>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReduceProd",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceProd>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -1986,7 +1986,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceProd>::createLayer(const std::
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceSum>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "ReduceSum",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::v1::ReduceSum>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -2003,7 +2003,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::NormalizeL2>::createLayer(const std::sha
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Log>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Log",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2011,7 +2011,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Log>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::NormalizeIE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Normalize",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::NormLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::NormalizeIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -2025,6 +2025,8 @@ CNNLayer::Ptr NodeConverter<ngraph::op::NormalizeIE>::createLayer(const std::sha
     if (converter.canCreate(weightsNode)) {
         const auto& weights = converter.createLayer(weightsNode);
         res->blobs["weights"] = weights->blobs["custom"];
+    } else {
+        THROW_IE_EXCEPTION << "Cannot convert weight node for NormalizeIE op";
     }
 
     return res;
@@ -2034,7 +2036,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::CTCGreedyDecoder>::createLayer(
     const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "CTCGreedyDecoder",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = ngraph::as_type_ptr<ngraph::op::CTCGreedyDecoder>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -2046,7 +2048,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::CTCGreedyDecoder>::createLayer(
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Erf>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Erf",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2054,7 +2056,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Erf>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Sign>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Sign",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2062,7 +2064,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Sign>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Sin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Sin",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2070,7 +2072,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Sin>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Sinh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Sinh",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2078,7 +2080,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Sinh>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Asin>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Asin",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2086,7 +2088,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Asin>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Cos>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Cos",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2094,7 +2096,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Cos>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Cosh>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Cosh",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2102,7 +2104,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Cosh>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Acos>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Acos",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2110,7 +2112,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Acos>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Tan>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Tan",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2118,7 +2120,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Tan>::createLayer(const std::shared_ptr<
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Atan>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Atan",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2126,7 +2128,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::Atan>::createLayer(const std::shared_ptr
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::Sqrt>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "Sqrt",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     return res;
 }
@@ -2142,7 +2144,7 @@ template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::StridedSliceIE>::createLayer(
         const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "StridedSlice",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::StridedSliceLayer>(params);
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::StridedSliceIE>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
@@ -2209,7 +2211,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::HardSigmoid>::createLayer(const std::sha
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::HardSigmoid_IE>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = { layer->get_friendly_name(), "HardSigmoid", details::ngraph::convertPrecision(layer->get_output_element_type(0)) };
+    LayerParams params = { layer->get_friendly_name(), "HardSigmoid", details::convertPrecision(layer->get_output_element_type(0)) };
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::HardSigmoid_IE>(layer);
     if (castedLayer == nullptr)
@@ -2223,7 +2225,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::HardSigmoid_IE>::createLayer(const std::
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::GRN>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
     LayerParams params = {layer->get_friendly_name(), "GRN",
-                          details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+                          details::convertPrecision(layer->get_output_element_type(0))};
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::GRN>(layer);
     if (castedLayer == nullptr) THROW_IE_EXCEPTION << "Cannot get " << params.type << " layer " << params.name;
 
@@ -2234,7 +2236,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::GRN>::createLayer(const std::shared_ptr<
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::LogicalNot>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = {layer->get_friendly_name(), "Activation", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+    LayerParams params = {layer->get_friendly_name(), "Activation", details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::CNNLayer>(params);
     res->params["type"] = "not";
     return res;
@@ -2242,7 +2244,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::LogicalNot>::createLayer(const std::
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceLogicalAnd>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = {layer->get_friendly_name(), "ReduceAnd", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+    LayerParams params = {layer->get_friendly_name(), "ReduceAnd", details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
 
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::v1::ReduceLogicalAnd>(layer);
@@ -2254,7 +2256,7 @@ CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceLogicalAnd>::createLayer(const
 
 template <>
 CNNLayer::Ptr NodeConverter<ngraph::op::v1::ReduceLogicalOr>::createLayer(const std::shared_ptr<ngraph::Node>& layer) const {
-    LayerParams params = {layer->get_friendly_name(), "ReduceOr", details::ngraph::convertPrecision(layer->get_output_element_type(0))};
+    LayerParams params = {layer->get_friendly_name(), "ReduceOr", details::convertPrecision(layer->get_output_element_type(0))};
     auto res = std::make_shared<InferenceEngine::ReduceLayer>(params);
 
     auto castedLayer = std::dynamic_pointer_cast<ngraph::op::v1::ReduceLogicalOr>(layer);
@@ -93,7 +93,7 @@ private:
 
     Blob::Ptr shareWeights(const std::shared_ptr<ngraph::op::Constant>& constLayer) const {
         if (!constLayer) THROW_IE_EXCEPTION << "Cannot share weights! Constant operation is empty!";
-        auto dataPrecision = details::ngraph::convertPrecision(constLayer->get_element_type());
+        auto dataPrecision = details::convertPrecision(constLayer->get_element_type());
 
         size_t shapeSize = ngraph::shape_size(constLayer->get_shape());
         if (dataPrecision == Precision::BIN) {
index ccf9f74..9c38fe6 100644 (file)
@@ -2447,7 +2447,9 @@ void PriorBoxClusteredValidator::checkShapes(const CNNLayer* layer, const std::v
 PriorBoxClusteredValidator::PriorBoxClusteredValidator(const std::string& _type): LayerValidator(_type) {}
 
 void ProposalValidator::parseParams(CNNLayer* layer) {
-    layer->params["num_outputs"] = std::to_string(layer->outData.size());
+    if (layer->params.find("num_outputs") == layer->params.end()) {
+        layer->params["num_outputs"] = std::to_string(layer->outData.size());
+    }
 }
 
 void ProposalValidator::checkParams(const CNNLayer* layer) {
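[Editorial note on the hunk above] The guard keeps a num_outputs value that was already parsed from the IR instead of unconditionally overwriting it with the current output count. A minimal hedged sketch of the behavioral difference (the layer name and values are hypothetical, not taken from the patch):

    // Sketch: a Proposal layer whose IR already specifies num_outputs=2
    // while only one output Data is registered at validation time.
    CNNLayer proposal(LayerParams{"proposal", "Proposal", Precision::FP32});
    proposal.params["num_outputs"] = "2";   // value that came from the IR
    // old behavior: params["num_outputs"] = std::to_string(proposal.outData.size());  // -> "1", IR value lost
    // new behavior: the key is already present, so the IR-provided "2" is preserved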
@@ -3074,52 +3076,55 @@ void NMSValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>&
         THROW_IE_EXCEPTION << layer->name << " 'score_threshold' should be scalar";
 }
 
-ScatterValidator::ScatterValidator(const std::string& _type): LayerValidator(_type) {}
+ScatterUpdateValidator::ScatterUpdateValidator(const std::string& _type): LayerValidator(_type) {}
 
-void ScatterValidator::parseParams(CNNLayer* layer) {
-    auto casted = dynamic_cast<ScatterLayer*>(layer);
+void ScatterUpdateValidator::parseParams(CNNLayer* layer) {
+    auto casted = dynamic_cast<ScatterUpdateLayer*>(layer);
     if (!casted) {
-        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterLayer class";
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterUpdateLayer class";
     }
-
-    casted->axis = casted->GetParamAsInt("axis", 0);
 }
 
-void ScatterValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
-    auto casted = dynamic_cast<const ScatterLayer*>(layer);
+void ScatterUpdateValidator::checkShapes(const CNNLayer* layer, const vector<SizeVector>& inShapes) const {
+    auto casted = dynamic_cast<const ScatterUpdateLayer*>(layer);
     if (!casted) {
-        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterLayer class";
+        THROW_IE_EXCEPTION << layer->name << " Layer is not instance of ScatterUpdateLayer class";
     }
 
     size_t numInputs = inShapes.size();
-    if (numInputs != 3)
-        THROW_IE_EXCEPTION << layer->name << " Scatter can take only 3 inputs, but actually it has: " << numInputs;
+    if (numInputs != 4)
+        THROW_IE_EXCEPTION << layer->name << " Scatter can take only 4 inputs, but actually it has: " << numInputs;
 
-    if (!(-static_cast<int>(inShapes[0].size()) <= casted->axis && casted->axis < static_cast<int>(inShapes[0].size())))
-        THROW_IE_EXCEPTION << layer->name << " Incorrect input parameters dimensions and axis number!";
+    static constexpr int DATA = 0;
+    static constexpr int INDICES = 1;
+    static constexpr int UPDATES = 2;
+    static constexpr int AXIS = 3;
 
-    if (inShapes[0].size() == 0 || (inShapes[0].size() == 1 && inShapes[0][0] == 1))
-        THROW_IE_EXCEPTION << layer->name << " 'Data' tensor rank should be >= 1";
+    if (inShapes[DATA].size() < 1)
+        THROW_IE_EXCEPTION << layer->name << " 'Data' tensor rank must be >= 1";
 
-    if (inShapes[1].size() == 0 || (inShapes[1].size() == 1 && inShapes[1][0] == 1))
-        THROW_IE_EXCEPTION << layer->name << " 'Indexes' tensor rank should be >= 1";
+    if (inShapes[INDICES].size() < 1)
+        THROW_IE_EXCEPTION << layer->name << " 'Indices' tensor rank must be >= 1";
 
-    if (inShapes[1].size() == 0 || (inShapes[1].size() == 1 && inShapes[1][0] == 1))
-        THROW_IE_EXCEPTION << layer->name << " 'Updates' tensor rank should be >= 1";
+    if (inShapes[UPDATES].size() < 1)
+        THROW_IE_EXCEPTION << layer->name << " 'Updates' tensor rank must be >= 1";
 
-    if (inShapes[1] != inShapes[2])
-        THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indexes' and 'updates' tensors dimension";
+    if (!(inShapes[AXIS].size() == 1 && inShapes[AXIS][0] == 1))
+        THROW_IE_EXCEPTION << layer->name << " 'Axis' tensor must be 1D array of 1 element";
 
-    const size_t SCATTER_DATA = 0;
-    const size_t SCATTER_INDEXES = 1;
-    const size_t SCATTER_UPDATES = 2;
+    if (inShapes[UPDATES].size() != inShapes[INDICES].size() + inShapes[DATA].size() - 1)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect number of 'indexes' and 'updates' tensors dimension";
 
-    Precision inIdxPrecision = layer->insData[SCATTER_INDEXES].lock()->getTensorDesc().getPrecision();
+    Precision inIdxPrecision = layer->insData[INDICES].lock()->getTensorDesc().getPrecision();
     if (inIdxPrecision != Precision::FP32 && inIdxPrecision != Precision::I32)
-        THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Indexes' precision. Only FP32 or I32 are supported!";
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Indices' precision. Only FP32 or I32 are supported!";
+
+    Precision inAxisPrecision = layer->insData[AXIS].lock()->getTensorDesc().getPrecision();
+    if (inAxisPrecision != Precision::FP32 && inAxisPrecision != Precision::I32)
+        THROW_IE_EXCEPTION << layer->name << " Incorrect input 'Axis' precision. Only FP32 or I32 are supported!";
 
-    if (layer->insData[SCATTER_DATA].lock()->getTensorDesc().getPrecision() !=
-        layer->insData[SCATTER_UPDATES].lock()->getTensorDesc().getPrecision())
+    if (layer->insData[DATA].lock()->getTensorDesc().getPrecision() !=
+        layer->insData[UPDATES].lock()->getTensorDesc().getPrecision())
         THROW_IE_EXCEPTION << layer->name << " Precision should be equal for input tensors 'Data' and 'Updates'";
 }
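[Editorial note] A worked example of the new rank constraint, for orientation only (the concrete shapes are made up, not taken from the patch): with data [1000, 256, 7, 7], indices [125, 20] and a 1-element 1D axis input, the 'Updates' tensor must have rank 2 + 4 - 1 = 5, e.g. [125, 20, 256, 7, 7].

    // Sketch of shapes that satisfy ScatterUpdateValidator::checkShapes:
    std::vector<InferenceEngine::SizeVector> inShapes = {
        {1000, 256, 7, 7},      // DATA
        {125, 20},              // INDICES
        {125, 20, 256, 7, 7},   // UPDATES
        {1}                     // AXIS (1D tensor of one element)
    };
    size_t expectedUpdatesRank = inShapes[1].size() + inShapes[0].size() - 1;  // 2 + 4 - 1 = 5
    // inShapes[2].size() == expectedUpdatesRank, so the rank check above passes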
 
@@ -3248,7 +3253,7 @@ LayerValidators::LayerValidators() {
     REG_LAYER_VALIDATOR_FOR_TYPE(TopKValidator, TopK);
     REG_LAYER_VALIDATOR_FOR_TYPE(UniqueValidator, Unique);
     REG_LAYER_VALIDATOR_FOR_TYPE(NMSValidator, NonMaxSuppression);
-    REG_LAYER_VALIDATOR_FOR_TYPE(ScatterValidator, ScatterUpdate);
+    REG_LAYER_VALIDATOR_FOR_TYPE(ScatterUpdateValidator, ScatterUpdate);
 }
 
 }  // namespace InferenceEngine
index 0ac1c97..6cbd18d 100644 (file)
@@ -969,9 +969,9 @@ public:
     void checkShapes(const CNNLayer* layer, const std::vector<SizeVector>& inShapes) const override;
 };
 
-class ScatterValidator : public LayerValidator {
+class ScatterUpdateValidator : public LayerValidator {
 public:
-    explicit ScatterValidator(const std::string& _type);
+    explicit ScatterUpdateValidator(const std::string& _type);
 
     void parseParams(CNNLayer* layer) override;
 
index f561ddc..e58e562 100644 (file)
@@ -67,6 +67,7 @@ ReduceLayer::~ReduceLayer() {}
 TopKLayer::~TopKLayer() {}
 UniqueLayer::~UniqueLayer() {}
 NonMaxSuppressionLayer::~NonMaxSuppressionLayer() {}
-ScatterLayer::~ScatterLayer() {}
+ScatterUpdateLayer::~ScatterUpdateLayer() {}
 ExperimentalDetectronPriorGridGeneratorLayer::~ExperimentalDetectronPriorGridGeneratorLayer() {}
 ExperimentalDetectronGenerateProposalsSingleImageLayer::~ExperimentalDetectronGenerateProposalsSingleImageLayer() {}
+ExperimentalDetectronTopKROIs::~ExperimentalDetectronTopKROIs() {}
index 93f0317..8fb75b4 100644 (file)
@@ -77,9 +77,10 @@ CNNLayerPtr layerCloneImpl<TensorIterator>(const CNNLayer* source) {
 CNNLayerPtr clonelayer(const CNNLayer& source) {
     using fptr = CNNLayerPtr (*)(const CNNLayer*);
     // Most derived layers must go first in this list
-    static const fptr cloners[] = {&layerCloneImpl<ExperimentalDetectronGenerateProposalsSingleImageLayer>,
+    static const fptr cloners[] = {&layerCloneImpl<ExperimentalDetectronTopKROIs>,
+                                   &layerCloneImpl<ExperimentalDetectronGenerateProposalsSingleImageLayer>,
                                    &layerCloneImpl<ExperimentalDetectronPriorGridGeneratorLayer>,
-                                   &layerCloneImpl<ScatterLayer>,
+                                   &layerCloneImpl<ScatterUpdateLayer>,
                                    &layerCloneImpl<NonMaxSuppressionLayer>,
                                    &layerCloneImpl<SelectLayer>,
                                    &layerCloneImpl<BatchNormalizationLayer>,
@@ -145,6 +146,35 @@ CNNLayerPtr clonelayer(const CNNLayer& source) {
     return nullptr;  // Silence "control may reach end of non-void function" warning
 }
 
+std::shared_ptr<ICNNNetwork> cloneNetwork(const ICNNNetwork& network) {
+    if (auto func = network.getFunction()) {
+        CNNNetwork net(func);
+
+        InputsDataMap originInputs;
+        OutputsDataMap originOutputs;
+        network.getInputsInfo(originInputs);
+        network.getOutputsInfo(originOutputs);
+        InputsDataMap clonedInputs = net.getInputsInfo();
+        OutputsDataMap clonedOutputs = net.getOutputsInfo();
+
+        for (const auto& outputInfo : originOutputs) {
+            if (clonedOutputs.find(outputInfo.first) == clonedOutputs.end())
+                THROW_IE_EXCEPTION << "Cannot clone network! Cloned network doesn't contain all outputs";
+            clonedOutputs[outputInfo.first]->setPrecision(outputInfo.second->getPrecision());
+            clonedOutputs[outputInfo.first]->setLayout(outputInfo.second->getLayout());
+        }
+        for (const auto& inputInfo : originInputs) {
+            if (clonedInputs.find(inputInfo.first) == clonedInputs.end())
+                THROW_IE_EXCEPTION << "Cannot clone network! Cloned network doesn't contain all inputs";
+            clonedInputs[inputInfo.first]->setPrecision(inputInfo.second->getPrecision());
+            clonedInputs[inputInfo.first]->setLayout(inputInfo.second->getLayout());
+            clonedInputs[inputInfo.first]->getPreProcess() = inputInfo.second->getPreProcess();
+        }
+        return net;
+    }
+
+    return cloneNet(network);
+}
 details::CNNNetworkImplPtr cloneNet(const ICNNNetwork& network) {
     std::vector<CNNLayerPtr> layers;
     details::CNNNetworkIterator i(&network);
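[Editorial note] A brief hedged usage sketch of the new cloneNetwork() helper (only the function itself comes from the hunk above; the surrounding application code is illustrative): for ngraph-based networks it clones through CNNNetwork(func) and copies input/output precisions, layouts and preprocessing, otherwise it falls back to the legacy cloneNet().

    // Sketch: clone first, then adjust the copy without touching the original network.
    std::shared_ptr<ICNNNetwork> copy = cloneNetwork(network);   // `network` is an existing ICNNNetwork
    InputsDataMap clonedInputs;
    copy->getInputsInfo(clonedInputs);
    for (auto& in : clonedInputs) {
        in.second->setPrecision(Precision::U8);                  // original inputs keep their precision
    }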
index 0cce4e1..294d574 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index f36d535..442dc5f 100644 (file)
@@ -487,6 +487,7 @@ std::size_t FillXmlDoc(const InferenceEngine::ICNNNetwork& network, pugi::xml_do
         if (dumpWeights && !node->blobs.empty()) {
             auto blobsNode = layer.append_child("blobs");
             for (const auto& dataIt : node->blobs) {
+                if (!dataIt.second) continue;
                 size_t dataSize = dataIt.second->byteSize();
                 pugi::xml_node data = blobsNode.append_child(dataIt.first.c_str());
                 data.append_attribute("offset").set_value(dataOffset);
@@ -546,11 +547,12 @@ void SerializeBlobs(std::ostream& stream, const InferenceEngine::ICNNNetwork& ne
     for (auto&& node : ordered) {
         if (!node->blobs.empty()) {
             for (const auto& dataIt : node->blobs) {
+                if (!dataIt.second) continue;
                 const char* dataPtr = dataIt.second->buffer().as<char*>();
                 size_t dataSize = dataIt.second->byteSize();
                 stream.write(dataPtr, dataSize);
                 if (!stream.good()) {
-                    THROW_IE_EXCEPTION << "Error during writing blob waights";
+                    THROW_IE_EXCEPTION << "Error during writing blob weights";
                 }
             }
         }
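[Editorial note] A hedged illustration of the case the two `if (!dataIt.second) continue;` guards above protect against (the layer and blob names here are hypothetical): a blob key can be registered on a layer without ever being allocated, and serialization would previously dereference the null pointer.

    // Sketch: a layer whose optional "biases" blob slot was created but left empty.
    CNNLayer fc(LayerParams{"fc1", "FullyConnected", Precision::FP32});
    fc.blobs["weights"] = make_shared_blob<float>(TensorDesc(Precision::FP32, {16, 8}, Layout::NC));
    fc.blobs["weights"]->allocate();
    fc.blobs["biases"] = nullptr;   // byteSize()/buffer() on this entry would crash the dump without the guard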
index 1ae2e61..8dd5294 100644 (file)
@@ -248,7 +248,7 @@ REG_SHAPE_INFER_FOR_TYPE(GatherTreeShapeProp, GatherTree);
 REG_SHAPE_INFER_FOR_TYPE(TopKShapeProp, TopK);
 REG_SHAPE_INFER_FOR_TYPE(UniqueShapeProp, Unique);
 REG_SHAPE_INFER_FOR_TYPE(NMSShapeProp, NonMaxSuppression);
-REG_SHAPE_INFER_FOR_TYPE(ScatterShapeProp, Scatter);
+REG_SHAPE_INFER_FOR_TYPE(ScatterUpdateShapeProp, ScatterUpdate);
 
 }  // namespace ShapeInfer
 }  // namespace InferenceEngine
index 19cfbb8..04b7f6a 100644 (file)
@@ -15,19 +15,19 @@ namespace InferenceEngine {
 namespace ShapeInfer {
 
 /**
- *@brief Implementation of Shape inference for Scatter layer
+ *@brief Implementation of Shape inference for ScatterUpdate layer
  */
-class ScatterShapeProp : public BuiltInShapeInferImpl {
+class ScatterUpdateShapeProp : public BuiltInShapeInferImpl {
 public:
-    explicit ScatterShapeProp(const std::string& type): BuiltInShapeInferImpl(type) {}
+    explicit ScatterUpdateShapeProp(const std::string& type): BuiltInShapeInferImpl(type) {}
 
     void inferShapesImpl(const std::vector<Blob::CPtr>& inBlobs, const std::map<std::string, std::string>& params,
                          const std::map<std::string, Blob::Ptr>& blobs, std::vector<SizeVector>& outShapes) override {
         LayerParams lp {};
-        ScatterLayer scatterLayer(lp);
-        scatterLayer.params = params;
-        scatterLayer.type = _type;
-        validate(&scatterLayer, inBlobs, params, blobs);
+        ScatterUpdateLayer scatterUpdateLayer(lp);
+        scatterUpdateLayer.params = params;
+        scatterUpdateLayer.type = _type;
+        validate(&scatterUpdateLayer, inBlobs, params, blobs);
 
         outShapes = {inShapes[0]};
     }
index aa27efe..1067869 100644 (file)
@@ -33,7 +33,6 @@ target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR}
        $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
 
 # developer package
 
@@ -42,6 +41,6 @@ ie_developer_export_targets(${TARGET_NAME})
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
diff --git a/inference-engine/src/mkldnn_plugin/bf16transformer.cpp b/inference-engine/src/mkldnn_plugin/bf16transformer.cpp
new file mode 100644 (file)
index 0000000..0558bd1
--- /dev/null
@@ -0,0 +1,238 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bf16transformer.h"
+#include <string>
+#include <vector>
+#include <fstream>
+#include <utility>
+#include <set>
+#include <chrono>
+#include "details/ie_cnn_network_tools.h"
+#include "ie_util_internal.hpp"
+#include "ngraph/type/bfloat16.hpp"
+
+using namespace MKLDNNPlugin;
+using namespace InferenceEngine;
+using namespace InferenceEngine::details;
+
+void precisionColoringBF16(const CNNLayerPtr layer,
+                           ordered_properties &printed_properties,
+                           ordered_properties &node_properties) {
+    if (layer && !layer->insData.empty() && layer->input()) {
+        printed_properties.insert(printed_properties.begin(),
+                                  std::pair<std::string, std::string>("Precision",
+                                                                      layer->input()->getPrecision() == Precision::FP32 ? "FP32" : "BF16"));
+
+        if (layer->input()->getPrecision() == Precision::FP32) {
+            node_properties.emplace_back("fillcolor", "#5A5DF0");
+        } else {
+            node_properties.emplace_back("fillcolor", "#20F608");
+        }
+    }
+}
+
+void BF16Transformer::convertToFloat(InferenceEngine::CNNNetwork &network) {
+    // go over all edges and mark those having BF16 precision as FP32
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+    InputsDataMap inputs = network.getInputsInfo();
+    OutputsDataMap outputs = network.getOutputsInfo();
+    for (auto iter : sortedLayers) {
+        for (size_t o = 0; o < iter->outData.size(); o++) {
+            if (inputs.find(iter->outData[o]->getName()) == inputs.end()
+                && outputs.find(iter->outData[o]->getName()) == outputs.end()
+                && iter->outData[o]->getPrecision() == Precision::BF16) {
+                iter->outData[o]->setPrecision(Precision::FP32);
+            }
+        }
+    }
+}
+
+void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
+    // go over all edges and mark those having FP32 precision as BF16
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+    InputsDataMap inputs = network.getInputsInfo();
+    OutputsDataMap outputs = network.getOutputsInfo();
+    for (auto iter : sortedLayers) {
+        for (size_t o = 0; o < iter->outData.size(); o++) {
+            if (inputs.find(iter->outData[o]->getName()) == inputs.end()
+                && outputs.find(iter->outData[o]->getName()) == outputs.end()
+                && iter->outData[o]->getPrecision() == Precision::FP32) {
+                iter->outData[o]->setPrecision(Precision::BF16);
+            }
+        }
+    }
+
+    // convert all edges back to FP32 on demand
+    optimizeToFloat(network);
+}
+
+void BF16Transformer::optimizeToFloat(InferenceEngine::CNNNetwork &network) {
+    std::set<DataPtr> toAnalyzeTensors;
+    std::set<DataPtr> immutable;
+    bool hasBF16Tensor = false;
+    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network);
+    // 1. Check whether the network has any bf16 tensors at all - if not, return early since there is nothing to revert to FP32
+    for (auto iter : sortedLayers) {
+        for (size_t i = 0; i < iter->insData.size(); i++) {
+            if (iter->insData[i].lock()->getTensorDesc().getPrecision() == Precision::BF16) {
+                hasBF16Tensor = true;
+            }
+        }
+        for (size_t o = 0; o < iter->outData.size(); o++) {
+            if (iter->outData[o]->getTensorDesc().getPrecision() == Precision::BF16) {
+                hasBF16Tensor = true;
+            }
+        }
+    }
+    if (!hasBF16Tensor) {
+        return;
+    }
+    // 2a. go over all inputs and outputs and put them to the toAnalyzeTensors
+    InputsDataMap inputs = network.getInputsInfo();
+    for (auto input : inputs) {
+        immutable.insert(input.second->getInputData());
+        if (input.second->getInputData()->getTensorDesc().getPrecision() != Precision::BF16) {
+            toAnalyzeTensors.insert(input.second->getInputData());
+        }
+    }
+
+    OutputsDataMap outputs = network.getOutputsInfo();
+    for (auto output : outputs) {
+        immutable.insert(output.second);
+        if (output.second->getTensorDesc().getPrecision() != Precision::BF16) {
+            toAnalyzeTensors.insert(output.second);
+        }
+    }
+
+    // 2b. go over all unknown layers for this algo and mark them as fp32 and add to the toAnalyzeTensors
+    // 2c. go over all inputs to _initbf16 and if they are fp32 - add them to the toAnalyzeTensors
+    for (auto iter : sortedLayers) {
+        if (_initbf16.find(iter->type) == _initbf16.end()
+            && _complementbf16.find(iter->type) == _complementbf16.end()
+            && _multiinput.find(iter->type) == _multiinput.end()) {
+            // try to mark inputs of the unknown layer
+            for (size_t i = 0; i < iter->insData.size(); i++) {
+                if (iter->insData[i].lock()->getPrecision() == Precision::BF16) {
+                    bool marked = tryToMarkFP32(iter->insData[i].lock(), immutable);
+                    if (marked) {
+                        toAnalyzeTensors.insert(iter->insData[i].lock());
+                    }
+                }
+            }
+            // try to mark outputs of the unknown layer
+            for (size_t o = 0; o < iter->outData.size(); o++) {
+                if (iter->outData[o]->getPrecision() == Precision::BF16) {
+                    bool marked = tryToMarkFP32(iter->outData[o], immutable);
+                    if (marked) {
+                        toAnalyzeTensors.insert(iter->outData[o]);
+                    }
+                }
+            }
+        }
+        if (_initbf16.find(iter->type) != _initbf16.end()) {
+            // verify if input activation tensor is not bf16 - add to toAnalyzeTensors as well
+            // we are assuming here that _initbf16 contain only layers having one dynamic input
+            // in other case algorithm should be changed to care about two dynamic input tensors
+            // and take into account case of different precision if they are
+            if (iter->insData[0].lock()->getTensorDesc().getPrecision() != Precision::BF16) {
+                toAnalyzeTensors.insert(iter->insData[0].lock());
+                // output tensor for FP32 convolution/FC layers should be FP32 as well
+                for (size_t o = 0; o < iter->outData.size(); o++) {
+                    if (iter->outData[o]->getPrecision() == Precision::BF16) {
+                        bool marked = tryToMarkFP32(iter->outData[o], immutable);
+                        if (marked) {
+                            toAnalyzeTensors.insert(iter->outData[o]);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // 3 - while toAnalyzeTensors is not empty look at the layers dealing with tensors mentioned in toAnalyzeTensors
+    while (!toAnalyzeTensors.empty()) {
+        DataPtr tensor = *toAnalyzeTensors.begin();
+        toAnalyzeTensors.erase(tensor);
+        // look into producer of the tensor
+        auto layer = tensor->getCreatorLayer().lock();
+        // if this layer is not from _initbf16 - analyze inputs
+        if (_initbf16.find(layer->type) == _initbf16.end()) {
+            // for all inputs investigate and modify tensor precision if required
+            for (size_t i = 0; i < layer->insData.size(); i++) {
+                bool marked = tryToMarkFP32(layer->insData[i].lock(), immutable);
+                if (marked) {
+                    toAnalyzeTensors.insert(layer->insData[i].lock());
+                }
+            }
+        }
+
+        // mark all produced tensors to FP32 if they are BF16 and if they do not go _only_ to the toAnalyzeTensors
+        // TODO: when we enable greedy mode and start to produce bf16 tensor even if one consumer accepts it,
+        // this place should be changed.
+        // Instead of "if they do not go _only_ to the toAnalyzeTensors" we have to apply "if they do not go at least to one of _initbf16"
+        // TODO: add test input1->pooling1->conv1 and the same pooling1->relu. for example. now convolution should be returned to fp32
+        // after greedy mode, it should be fp32.
+        for (auto inputTo : tensor->getInputTo()) {
+            for (size_t o = 0; o < inputTo.second->outData.size(); o++) {
+                if (inputTo.second->outData[o]->getTensorDesc().getPrecision() == Precision::BF16) {
+                    bool marked = tryToMarkFP32(inputTo.second->outData[o], immutable);
+                    if (marked) {
+                        toAnalyzeTensors.insert(inputTo.second->outData[o]);
+                    }
+                }
+            }
+        }
+    }
+
+#ifndef NDEBUG
+    {
+        std::ofstream file("bf16_icnnnetwork.dot");
+        saveGraphToDot(network, file, precisionColoringBF16);
+    }
+#endif
+}
+
+bool BF16Transformer::tryToMarkFP32(InferenceEngine::DataPtr data, const std::set<InferenceEngine::DataPtr>& immutable) {
+    bool marked = false;
+    if (immutable.find(data) == immutable.end() && data->getPrecision() == Precision::BF16) {
+        // we treat one consumer and many in different ways
+        // if there is one consumer, we can mark its input as float if it does not belong to the list of initial layers
+        // in other cases we mark a tensor that is passed to several layers as FP32 only if at least one consumer
+        // produces data in FP32, i.e. there should be a way of getting FP32 from the output data to this point
+        if (data->getInputTo().size() == 1) {
+            if (_initbf16.find(data->getInputTo().begin()->second->type) == _initbf16.end()) {
+                marked = true;
+            }
+        } else {
+            // get all consumers
+            for (auto o : data->getInputTo()) {
+                // if tensor goes to several layers, we will mark it by FP32 only if one of the layer is unknown
+                if (_initbf16.find(o.second->type) == _initbf16.end() &&
+                    _complementbf16.find(o.second->type) == _complementbf16.end() &&
+                    _multiinput.find(o.second->type) == _multiinput.end()) {
+                    marked = true;
+                }
+            }
+        }
+        if (marked) {
+            data->setPrecision(Precision::FP32);
+        }
+    }
+    return marked;
+}
+
+InferenceEngine::MemoryBlob::Ptr BF16Transformer::convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr tweights) {
+    TensorDesc td(Precision::FP32, tweights->getTensorDesc().getDims(), tweights->getTensorDesc().getLayout());
+    MemoryBlob::Ptr weightsFP32 = make_shared_blob<float>(td);
+    weightsFP32->allocate();
+    auto lmbf16 = tweights->rmap();
+    short *bf16data = lmbf16.as<short *>();
+    auto lmfp32 = weightsFP32->wmap();
+    float *fp32data = lmfp32.as<float *>();
+    for (size_t i = 0; i < weightsFP32->size(); i++) {
+        fp32data[i] = ngraph::bfloat16::from_bits(bf16data[i]);
+    }
+    return weightsFP32;
+}
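[Editorial note, not part of the patch] bfloat16 keeps the fp32 sign bit and 8-bit exponent and truncates the mantissa to 7 bits, which is why the MKLDNN extension utils later in this patch report 2 bytes for bf16 and why convertBF16ToFloat() above can rebuild fp32 values via ngraph::bfloat16::from_bits(). A minimal round-trip sketch, assuming the same ngraph helper type:

    #include "ngraph/type/bfloat16.hpp"

    // Sketch: fp32 -> bf16 -> fp32 round trip; precision drops to roughly 3 significant decimal digits.
    float roundTrip(float original) {
        ngraph::bfloat16 narrowed(original);                          // keeps sign, exponent and the top 7 mantissa bits
        return ngraph::bfloat16::from_bits(narrowed.to_bits());       // same per-element step as convertBF16ToFloat()
    }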
diff --git a/inference-engine/src/mkldnn_plugin/bf16transformer.h b/inference-engine/src/mkldnn_plugin/bf16transformer.h
new file mode 100644 (file)
index 0000000..22becc6
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <details/caseless.hpp>
+#include <string>
+#include <set>
+#include "inference_engine.hpp"
+
+namespace MKLDNNPlugin {
+
+class BF16Transformer {
+    const InferenceEngine::details::caseless_set<std::string> _initbf16 =
+        { "convolution", "fullyconnected", "innerproduct" };
+    const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
+        { "relu", "pooling", "norm", "gather" };
+    const InferenceEngine::details::caseless_set<std::string> _multiinput =
+        { "concat", "eltwise" };
+
+    /**
+    * Tries to mark a tensor as FP32 by analyzing its local consumers. Does not mark the tensor if
+    *
+    * 1. the tensor goes to an init layer (conv or fc)
+    * 2. it goes to layers which can work with BF16
+    *
+    * if the tensor goes to a layer not supporting BF16, it will be marked as FP32
+    */
+    bool tryToMarkFP32(InferenceEngine::DataPtr data, const std::set<InferenceEngine::DataPtr> &immutable);
+
+public:
+    /**
+     * Restores floating point data types on edges which go to unsupported layers
+     *
+     * Algo:
+     * 1. Check whether the network has any bf16 tensors at all - if not, return early since there is
+     * nothing to revert
+     * 2a. go over all inputs and outputs and if the data type is not BF16, put them into toAnalyzeTensors
+     * 2b. go over all layers unknown to this algo, mark them as fp32, add their inputs and
+     * outputs to toAnalyzeTensors and try to mark them as FP32
+     * 2c. go over all inputs to _initbf16 and if they are fp32, add them to toAnalyzeTensors
+     *
+     * 3 - while toAnalyzeTensors is not empty look at the layers dealing with tensors mentioned in
+     * toAnalyzeTensors, analyze parent and children and depending on the type of the layers try to
+     * extend FP32 data type
+    */
+    void optimizeToFloat(InferenceEngine::CNNNetwork &network);
+
+    /**
+     * Converts all edges from bfloat16 to float data type. Does not touch input and output nodes
+     */
+    void convertToFloat(InferenceEngine::CNNNetwork &network);
+
+    /**
+    * Converts all fp32 edges except inputs and outputs to bf16 and then calls optimizeToFloat to restore FP32 where required
+    */
+    void convertToBFloat16(InferenceEngine::CNNNetwork &network);
+
+    InferenceEngine::MemoryBlob::Ptr convertBF16ToFloat(InferenceEngine::MemoryBlob::Ptr);
+};
+
+}  // namespace MKLDNNPlugin
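[Editorial note] A small hedged sketch of why caseless_set is used for the three layer-type lists above (illustrative only): layer type strings in IE graphs are not normalized for case, so membership tests have to ignore it.

    // Sketch: caseless_set matches independent of capitalization of the layer type.
    InferenceEngine::details::caseless_set<std::string> initbf16 = { "convolution", "fullyconnected", "innerproduct" };
    bool startsBF16Domain = initbf16.find("Convolution") != initbf16.end();   // true despite the different case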
index 694b6d0..3ff1172 100644 (file)
@@ -72,6 +72,12 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
             dumpQuantizedGraphToDot = val;
         } else if (key.compare(PluginConfigParams::KEY_DUMP_QUANTIZED_GRAPH_AS_IR) == 0) {
             dumpQuantizedGraphToIr = val;
+        } else if (key == PluginConfigParams::KEY_ENFORCE_BF16) {
+            if (val == PluginConfigParams::YES) enforceBF16 = true;
+            else if (val == PluginConfigParams::NO) enforceBF16 = false;
+            else
+                THROW_IE_EXCEPTION << "Wrong value for property key " << PluginConfigParams::KEY_ENFORCE_BF16
+                    << ". Expected only YES/NO";
         } else {
             THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property " << key << " by CPU plugin";
         }
@@ -112,6 +118,10 @@ void Config::updateProperties() {
         _config.insert({ PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS, std::to_string(streamExecutorConfig._streams) });
         _config.insert({ PluginConfigParams::KEY_CPU_THREADS_NUM, std::to_string(streamExecutorConfig._threads) });
         _config.insert({ PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT, dumpToDot });
+        if (enforceBF16)
+            _config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES });
+        else
+            _config.insert({ PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO });
     }
 }
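
On the application side the new key is set like any other CPU plugin option. A hedged usage sketch, assuming KEY_ENFORCE_BF16 is exposed through PluginConfigParams like the other CPU keys; the model path is a placeholder:

    #include <inference_engine.hpp>
    #include <map>
    #include <string>

    int main() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml");  // placeholder IR path

        // Request BF16 execution explicitly; any value other than YES/NO makes the plugin throw.
        std::map<std::string, std::string> config = {
            { InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
              InferenceEngine::PluginConfigParams::YES }
        };
        auto execNetwork = ie.LoadNetwork(network, "CPU", config);
        (void)execNetwork;
        return 0;
    }
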
 
index 0007bc5..2444f00 100644 (file)
@@ -32,6 +32,7 @@ struct Config {
     std::string dumpQuantizedGraphToDot = "";
     std::string dumpQuantizedGraphToIr = "";
     int batchLimit = 0;
+    bool enforceBF16 = false;
     InferenceEngine::IStreamsExecutor::Config streamExecutorConfig;
 
 #if defined(__arm__) || defined(__aarch64__)
index 65eaec7..f444ff5 100644 (file)
@@ -10,6 +10,7 @@
 #include "mkldnn_async_infer_request.h"
 #include "mkldnn_infer_request.h"
 #include "mkldnn_memory_state.h"
+#include "bf16transformer.h"
 #include <ie_util_internal.hpp>
 #include <graph_tools.hpp>
 #include <cnn_network_int8_normalizer.hpp>
@@ -100,6 +101,19 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::ICNNNetwork &network
                     LayerTransformation::Params(params).setPrecisionsOnActivations({ Precision::U8 }),
                     "ScaleShift"));
             transformer.transform(*_clonedNetwork);
+            if (with_cpu_x86_bfloat16()) {
+                BF16Transformer bf16Transformer;
+                CNNNetwork cnnetwork(_clonedNetwork);
+                if (cfg.enforceBF16 == true) {
+                    bf16Transformer.convertToBFloat16(cnnetwork);
+                } else {
+                    bf16Transformer.optimizeToFloat(cnnetwork);
+                }
+            } else {
+                BF16Transformer bf16Transformer;
+                CNNNetwork cnnetwork(_clonedNetwork);
+                bf16Transformer.convertToFloat(cnnetwork);
+            }
         }
     }
 
index 494a00d..0ab3e0a 100644 (file)
@@ -17,6 +17,8 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
         return 4;
     case mkldnn::memory::data_type::s16:
         return 2;
+    case mkldnn::memory::data_type::bf16:
+        return 2;
     case mkldnn::memory::data_type::s8:
         return 1;
     case mkldnn::memory::data_type::u8:
@@ -39,6 +41,8 @@ memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::P
             return memory::s32;
         case InferenceEngine::Precision::I16:
             return memory::s16;
+        case InferenceEngine::Precision::BF16:
+            return memory::bf16;
         case InferenceEngine::Precision::I8:
             return memory::s8;
         case InferenceEngine::Precision::U8:
@@ -61,6 +65,8 @@ InferenceEngine::Precision MKLDNNExtensionUtils::DataTypeToIEPrecision(memory::d
             return InferenceEngine::Precision::I32;
         case memory::s16:
             return InferenceEngine::Precision::I16;
+        case memory::bf16:
+            return InferenceEngine::Precision::BF16;
         case memory::s8:
             return InferenceEngine::Precision::I8;
         case memory::u8:
index 8cac262..8d72160 100644 (file)
@@ -701,6 +701,7 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
         return activationNode &&
             (activationNode->getAlgorithm() == eltwise_relu ||
             (conv->getCnnLayer()->precision == Precision::FP32 &&
+             conv->getCnnLayer()->insData[0].lock()->getPrecision() != Precision::BF16 &&
              isOneOf(activationNode->getAlgorithm(), {eltwise_elu, eltwise_logistic, eltwise_bounded_relu, eltwise_clamp, eltwise_swish})));
     };
 
@@ -774,6 +775,7 @@ void MKLDNNGraphOptimizer::FuseFullyConnectedAndSimpleOperation(MKLDNNGraph &gra
 
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
         return node->getType() == FullyConnected &&
+               node->getCnnLayer()->insData[0].lock()->getPrecision() != Precision::BF16 &&
                node->getChildEdges().size() == 1;
     };
 
@@ -845,7 +847,9 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
         bool isSutableConv = (node->getType() == Convolution) &&
                              node->getCnnLayer()->precision == Precision::FP32;
         bool isSutableBinConv = node->getType() == BinaryConvolution;
-        return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1;
+        return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1 &&
+               !(node->getCnnLayer()->insData[0].lock()->getPrecision() == Precision::BF16 &&
+                 node->getCnnLayer()->outData[0]->getPrecision() == Precision::FP32);
     };
 
     auto isSutableChildNode = [](MKLDNNNodePtr node) {
@@ -1118,7 +1122,9 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph)
     auto isSutableParentNode = [](MKLDNNNodePtr node) {
         return node->getType() == Convolution &&
                node->getChildEdges().size() == 1 &&
-               node->getCnnLayer()->precision == Precision::FP32;
+               node->getCnnLayer()->precision == Precision::FP32 &&
+             !(node->getCnnLayer()->insData[0].lock()->getPrecision() == Precision::BF16 &&
+               node->getCnnLayer()->outData[0]->getPrecision() == Precision::FP32);
     };
 
     auto isSutableChildNode = [&](MKLDNNNodePtr node) {
@@ -2233,4 +2239,4 @@ void MKLDNNGraphOptimizer::FuseScaleShiftAndQuantize(MKLDNNGraph &graph) {
             graph.DropNode(parent);
         }
     }
-}
\ No newline at end of file
+}
index e30f1c4..ad1bc55 100644 (file)
@@ -128,7 +128,8 @@ void MKLDNNMemory::SetData(const MKLDNNMemory& memory, bool ftz) const {
     mkldnn::reorder reorderPrim(memory.GetPrimitive(), GetPrimitive());
     mkldnn::stream(stream::kind::eager).submit({reorderPrim});
 
-    if (ftz && memory.GetDataType() == mkldnn::memory::f32 && GetFormat() != mkldnn::memory::wino_fmt) {
+    if (ftz && memory.GetDataType() == mkldnn::memory::f32 && GetFormat() != mkldnn::memory::wino_fmt &&
+        GetDataType() != mkldnn::memory::bf16) {
         // Internal blobs haven't strides yet.
         auto *memData = static_cast<float *>(GetData());
         memData += prim->get_primitive_desc().desc().data.layout_desc.blocking.offset_padding;
@@ -540,6 +541,9 @@ MKLDNNMemoryDesc::operator InferenceEngine::TensorDesc() const {
         case mkldnn_bin:
             precision = Precision::BIN;
             break;
+        case mkldnn_bf16:
+            precision = Precision::BF16;
+            break;
         default:
             THROW_IE_EXCEPTION << "Cannot cast to TensorDesc. Unsupported precision!";
     }
@@ -984,6 +988,9 @@ MKLDNNMemoryDesc::MKLDNNMemoryDesc(const TensorDesc& tDesc):
         case Precision::BOOL:
             data_type = mkldnn::memory::data_type::u8;
             break;
+        case Precision::BF16:
+            data_type = mkldnn::memory::data_type::bf16;
+            break;
         default:
             THROW_IE_EXCEPTION << "Cannot create MKLDNNMemoryDesc from TensorDesc. Unsupported precision!";
     }
index 99aaf86..e1c0920 100644 (file)
@@ -662,6 +662,8 @@ InferenceEngine::Blob::Ptr MKLDNNNode::createInternalBlob(InferenceEngine::SizeV
         internalBlob = InferenceEngine::make_shared_blob<int8_t>(desc);
     } else if (blb->getTensorDesc().getPrecision() == Precision::I32) {
         internalBlob = InferenceEngine::make_shared_blob<int32_t>(desc);
+    } else if (blb->getTensorDesc().getPrecision() == Precision::BF16) {
+        internalBlob = InferenceEngine::make_shared_blob<int16_t>(desc);
     } else {
         internalBlob = InferenceEngine::make_shared_blob<float>(desc);
     }
index 27ec34b..df73e8f 100644 (file)
@@ -15,7 +15,6 @@
 #include <ie_system_conf.h>
 #include <generic_ie.hpp>
 
-#include "cnn_network_ngraph_impl.hpp"
 #include "convert_function_to_cnn_network.hpp"
 #include <transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>
 #include <transformations/convert_opset2_to_opset1/convert_opset2_to_opset1.hpp>
@@ -87,24 +86,21 @@ Engine::LoadExeNetworkImpl(const ICore * /*core*/, const InferenceEngine::ICNNNe
 
     std::shared_ptr<ICNNNetwork> clonedNetwork(nullptr);
 
-    if (auto networkNGraph = dynamic_cast<const CNNNetworkNGraphImpl*>(&network)) {
-        auto nGraphNetwork = networkNGraph->cloneNGraphImpl();
-        if (!nGraphNetwork->getFunction()) {
-            clonedNetwork = nGraphNetwork->getCNNNetwork();
-        } else {
-            const auto transformations_callback = [](const std::shared_ptr<const ::ngraph::Node> &node) -> bool {
-                return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
-                       std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
-                       std::dynamic_pointer_cast<const ::ngraph::opset2::SpaceToBatch>(node);
-            };
-            // Disable shape inference (WA for generic operations)
-            ::ngraph::op::GenericIE::DisableReshape noReshape(nGraphNetwork->getFunction());
-
-            // Note: instead of running all Conversion Transformations you can make up your own transformation pipeline
-            ngraph::pass::ConvertOpSet2ToOpSet1(transformations_callback).run_on_function(nGraphNetwork->getFunction());
-            ngraph::pass::ConvertOpSet1ToLegacy(transformations_callback).run_on_function(nGraphNetwork->getFunction());
-            clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphNetwork->getFunction(), *nGraphNetwork.get());
-        }
+    if (network.getFunction()) {
+        const auto transformations_callback = [](const std::shared_ptr<const ::ngraph::Node> &node) -> bool {
+            return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) ||
+                std::dynamic_pointer_cast<const ::ngraph::opset2::BatchToSpace>(node) ||
+                std::dynamic_pointer_cast<const ::ngraph::opset2::SpaceToBatch>(node);
+        };
+        CNNNetwork net(network.getFunction());
+        auto nGraphFunc = net.getFunction();
+        // Disable shape inference (WA for generic operations)
+        ::ngraph::op::GenericIE::DisableReshape noReshape(nGraphFunc);
+
+        // Note: instead of running all Conversion Transformations you can make up your own transformation pipeline
+        ngraph::pass::ConvertOpSet2ToOpSet1(transformations_callback).run_on_function(nGraphFunc);
+        ngraph::pass::ConvertOpSet1ToLegacy(transformations_callback).run_on_function(nGraphFunc);
+        clonedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(nGraphFunc, network);
     } else {
         clonedNetwork = cloneNet(network);
     }
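
As the in-line note says, a plugin may assemble its own pass pipeline instead of running the full conversion. A hedged sketch of a reduced pipeline over a standalone ngraph::Function, reusing the two passes from this change and keeping Gelu for later plugin-specific handling (the opset2 include path is assumed):

    #include <memory>
    #include <ngraph/function.hpp>
    #include <ngraph/opsets/opset2.hpp>  // path assumed
    #include <transformations/convert_opset2_to_opset1/convert_opset2_to_opset1.hpp>
    #include <transformations/convert_opset1_to_legacy/convert_opset1_to_legacy.hpp>

    // Illustrative reduced pipeline: run the same two conversion passes, but skip
    // decomposition of Gelu so the plugin can map it to its own primitive later.
    void runReducedPipeline(const std::shared_ptr<ngraph::Function>& func) {
        const auto keepGelu = [](const std::shared_ptr<const ::ngraph::Node>& node) -> bool {
            return std::dynamic_pointer_cast<const ::ngraph::opset2::Gelu>(node) != nullptr;
        };
        ngraph::pass::ConvertOpSet2ToOpSet1(keepGelu).run_on_function(func);
        ngraph::pass::ConvertOpSet1ToLegacy(keepGelu).run_on_function(func);
    }
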
index 41f1185..c514164 100644 (file)
@@ -128,7 +128,13 @@ protected:
             }
 
             // All extension layers support only FP32 precision!
+            // Force FP32 where BF16 is requested - extension layers natively support only FP32.
+            // If we see BF16 here, it is another floating point format that will be handled by a reorder
+            // inserted by the mkl-dnn CPU plugin when it detects a data type mismatch on an edge.
             InferenceEngine::Precision precision = data_desc.getPrecision();
+            if (precision == Precision::BF16) {
+                precision = Precision::FP32;
+            }
             if (conf.layout == ConfLayout::ANY) {
                 dataConfig.desc = TensorDesc(precision, data_dims, InferenceEngine::Layout::ANY);
             } else {
index 252c8c0..0213857 100644 (file)
@@ -159,11 +159,12 @@ public:
                 THROW_IE_EXCEPTION << "Interp supports only 4d blobs!";
 
             auto src_precision = inData->getTensorDesc().getPrecision();
-            if (src_precision != Precision::FP32 && src_precision != Precision::U8)
-                THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8 or FP32 are supported!";
+            if (src_precision != Precision::FP32 && src_precision != Precision::U8 && src_precision != Precision::BF16)
+                THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only U8, FP32 or BF16 are supported!";
 
-            if (layer->outData[0]->getTensorDesc().getPrecision() != Precision::FP32)
-                THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 is supported!";
+            auto dst_precision = layer->outData[0]->getTensorDesc().getPrecision();
+            if (dst_precision != Precision::FP32 && dst_precision != Precision::BF16)
+                THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 or BF16 are supported!";
 
             // We don't read other parameters since they are needed only for dst reshape in caffe
             pad_beg = layer->GetParamAsInt("pad_beg");
@@ -197,14 +198,16 @@ public:
                 if (mayiuse(avx512_common)) {
                     blk_layout = ConfLayout::BLK16;
                     interp_kernel.reset(new jit_uni_interp_kernel_f32<avx512_common>());
+                    addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
                 } else if (mayiuse(avx2)) {
                     blk_layout = ConfLayout::BLK8;
                     interp_kernel.reset(new jit_uni_interp_kernel_f32<avx2>());
+                    addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
                 } else {
                     blk_layout = ConfLayout::BLK8;
                     interp_kernel.reset(new jit_uni_interp_kernel_f32<sse42>());
+                    addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
                 }
-                addConfig(layer, { DataConfigurator(blk_layout) }, { DataConfigurator(blk_layout) });
             }
         } catch (InferenceEngine::details::InferenceEngineException &ex) {
             errorMsg = ex.what();
@@ -258,8 +261,10 @@ public:
         case Precision::FP32:
         {
             const float* src_data = inputs[0]->cbuffer().as<const float *>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
-            size_t IC = inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1] *
-                        inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[4];
+            size_t IC = (inputs[0]->getTensorDesc().getLayout() == Layout::BLOCKED)
+                ? inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[1] *
+                inputs[0]->getTensorDesc().getBlockingDesc().getBlockDims()[4]
+                : inputs[0]->getTensorDesc().getDims()[1];
             interpolate(IN, IC, src_data,
                 -pad_beg, -pad_beg, IH_pad, IW_pad, IH, IW, dst_data, 0, 0, OH, OW, OH, OW);
         }
@@ -312,10 +317,12 @@ private:
         }
 
         int block_size = 1;
-        if (mayiuse(avx512_common)) {
-            block_size = 16;
-        } else {
-            block_size = 8;
+        if (interp_kernel) {
+            if (mayiuse(avx512_common)) {
+                block_size = 16;
+            } else {
+                block_size = 8;
+            }
         }
 
         // Align channel number to block size to deal with channels padding in IE with multiple blobs
@@ -358,14 +365,21 @@ private:
 
                         float *pdst = pdst_h + w * block_size;
 
-                        arg.src00 = psrc00;
-                        arg.src01 = psrc01;
-                        arg.src10 = psrc10;
-                        arg.src11 = psrc11;
-                        arg.dst = pdst;
-                        arg.w_lambda0 = static_cast<float*>(&w_lambda0);
-                        arg.w_lambda1 = static_cast<float*>(&w_lambda1);
-                        (*interp_kernel)(&arg);
+                        if (interp_kernel) {
+                            arg.src00 = psrc00;
+                            arg.src01 = psrc01;
+                            arg.src10 = psrc10;
+                            arg.src11 = psrc11;
+                            arg.dst = pdst;
+                            arg.w_lambda0 = static_cast<float*>(&w_lambda0);
+                            arg.w_lambda1 = static_cast<float*>(&w_lambda1);
+                            (*interp_kernel)(&arg);
+                        } else {
+                            for (int c = 0; c < block_size; ++c) {
+                                pdst[c] = h_lambda1 * (w_lambda1 * psrc00[c] + w_lambda0 * psrc01[c]) +
+                                    h_lambda0 * (w_lambda1 * psrc10[c] + w_lambda0 * psrc11[c]);
+                            }
+                        }
                     }
         });
     }
index 4520d70..58b60af 100644 (file)
@@ -91,6 +91,17 @@ bool MKLDNNConvolutionNode::canBeExecutedInInt8() {
     }
 }
 
+InferenceEngine::Precision MKLDNNConvolutionNode::fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex) {
+    InferenceEngine::Precision eltwisePrecision;
+    auto parent0 = eltwiseNode->getCnnLayer()->insData[0].lock()->getCreatorLayer().lock();
+    auto parent1 = eltwiseNode->getCnnLayer()->insData[1].lock()->getCreatorLayer().lock();
+
+    auto fusedParent = findex != 0 ? fusedWith[findex - 1].get()->getCnnLayer() : this->getCnnLayer();
+    eltwisePrecision = fusedParent == parent0 ? eltwiseNode->getCnnLayer()->insData[1].lock()->getPrecision() :
+        eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision();
+    return eltwisePrecision;
+}
+
 void MKLDNNConvolutionNode::getSupportedDescriptors() {
     if (!descs.empty())
         return;
@@ -116,17 +127,11 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
 
         // We need to make sure that convolution output and second input of fused Eltwise operation
         // have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
-        if (outputDataType != memory::f32 && isFusedWith(Eltwise)) {
+        if (outputDataType != memory::f32 && outputDataType != memory::bf16 && isFusedWith(Eltwise)) {
             for (int i = 0; i < fusedWith.size(); i++) {
                 auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
                 if (eltwiseNode) {
-                    auto parent0 = eltwiseNode->getCnnLayer()->insData[0].lock()->getCreatorLayer().lock();
-                    auto parent1 = eltwiseNode->getCnnLayer()->insData[1].lock()->getCreatorLayer().lock();
-
-                    auto fusedParent = i != 0 ? fusedWith[i-1].get()->getCnnLayer() : this->getCnnLayer();
-                    eltwisePrecision = fusedParent == parent0 ? eltwiseNode->getCnnLayer()->insData[1].lock()->getPrecision() :
-                                                                eltwiseNode->getCnnLayer()->insData[0].lock()->getPrecision();
-
+                    eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
                     if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
                         eltwisePrecision = Precision::FP32;
                         outputDataType = memory::f32;
@@ -274,10 +279,29 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
                 getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
         createDescriptor({in_candidate}, {out_candidate});
     } else {
-        // If the weights aren't quantized, the only precision we support is FP32
-        inputDataType = memory::f32;
-        outputDataType = memory::f32;
+        inputDataType = convLayer->input()->getPrecision() == Precision::BF16 ? memory::bf16 : memory::f32;
+        outputDataType = convLayer->outData[0]->getPrecision() == Precision::BF16 ? memory::bf16 : memory::f32;
         eltwisePrecision = Precision::FP32;
+        for (int i = 0; i < fusedWith.size(); i++) {
+            auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
+            if (eltwiseNode) {
+                eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
+                // TODO(amalyshe): there might be a situation when the convolution can be executed in BF16,
+                // the output is required in FP32, but the eltwise in-place tensor is in BF16.
+                // Currently we force the output to BF16, which adds a reorder after the node.
+                // Another option is to mark the output as FP32 and set eltwisePrecision (which stands
+                // for the precision of the in-place input tensor) to FP32. That adds a reorder for the in-place
+                // tensor before the fused convolution. This behaviour might be more correct with respect to the
+                // expected markup of the graph, but the performance of the two approaches might differ. Needs verification.
+                outputDataType = eltwisePrecision == Precision::BF16 ? memory::bf16 : memory::f32;
+            }
+        }
+        // correction for FP32 input - there is no FP32 convolution that supports BF16 output
+        if (inputDataType == memory::f32
+            && (outputDataType == memory::bf16 || eltwisePrecision == Precision::BF16)) {
+            outputDataType = memory::f32;
+            eltwisePrecision = Precision::FP32;
+        }
 
         Layout layout = convLayer->input()->getLayout();
 
@@ -628,6 +652,9 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
 
     mkldnn::memory::data_type wdt = precisionToDataType(inDesc.getPrecision());
     mkldnn::memory::data_type bdt = precisionToDataType(inDesc.getPrecision());
+    if (inDesc.getPrecision() == Precision::BF16) {
+        bdt = mkldnn::memory::data_type::f32;
+    }
 
     if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
         wdt = memory::s8;
@@ -739,7 +766,8 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
     // Works only for FP32 convolutions for now.
     bool isStridedBlobsSupported = true;
     for (auto &insData : getCnnLayer()->insData) {
-        if (insData.lock()->getPrecision() != InferenceEngine::Precision::FP32) {
+        if (insData.lock()->getPrecision() != InferenceEngine::Precision::FP32
+            && insData.lock()->getPrecision() != InferenceEngine::Precision::BF16) {
             isStridedBlobsSupported = false;
             break;
         }
index 827223e..232803b 100644 (file)
@@ -12,6 +12,8 @@
 
 namespace MKLDNNPlugin {
 
+class MKLDNNEltwiseNode;
+
 class MKLDNNConvolutionNode : public MKLDNNNode {
 public:
     MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, int socket);
@@ -50,6 +52,7 @@ public:
 
 protected:
     void addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const;
+    InferenceEngine::Precision fusedEltwisePrecision(MKLDNNEltwiseNode *eltwiseNode, int findex);
 
 private:
     mkldnn::memory::data_type precisionToDataType(InferenceEngine::Precision prec);
index d129ab5..a5ddce2 100644 (file)
@@ -511,6 +511,11 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
                 inputDT = MKLDNNExtensionUtils::IEPrecisionToDataType(in_prec);
             }
 
+            if (inputDT == memory::bf16 || outputDT == memory::bf16) {
+                inputDT = memory::f32;
+                outputDT = memory::f32;
+            }
+
             auto impl_desc = initDesc(inputDT, outputDT, format);
 
             if (impl_desc.getImplementationType() != impl_desc_type::undef) {
index cfe8a33..9748ae6 100644 (file)
@@ -78,7 +78,8 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
         }
         auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
 
-        if (inputDataType != memory::u8 || weightsDataType != memory::s8) {
+        // TODO(amalyse): what are the cases when we have non-I8 weights and have to override the precisions?
+        if ((inputDataType != memory::u8 || weightsDataType != memory::s8) && inputDataType != memory::bf16) {
             inputDataType = memory::f32;
             outputDataType = memory::f32;
         }
@@ -355,6 +356,9 @@ void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngin
     TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
     mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
     mkldnn::memory::data_type bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
+    if (inDesc.getPrecision() == Precision::BF16) {
+        bdt = mkldnn::memory::data_type::f32;
+    }
 
     if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
         wdt = memory::s8;
index d89f800..4c56bd0 100644 (file)
@@ -48,8 +48,9 @@ void MKLDNNInputNode::initSupportedPrimitiveDescriptors() {
     memory::format outFormat = mkldnn::memory::format_undef;
     if (getType() == Input || getType() == MemoryInput) {
         precision = getCnnLayer()->outData[0]->getPrecision();
-        if (precision == InferenceEngine::Precision::U16 || isMeanImage)
+        if (precision == InferenceEngine::Precision::U16 || isMeanImage) {
             precision = InferenceEngine::Precision::FP32;
+        }
         auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
         InferenceEngine::DataConfig dataConfig;
         dataConfig.inPlace = -1;
index 88721cc..f1d0cb0 100644 (file)
@@ -19,7 +19,7 @@ void MKLDNNLrnNode::getSupportedDescriptors() {
     if (!descs.empty())
         return;
     InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
-    if (precision != InferenceEngine::Precision::FP32)
+    if (precision != InferenceEngine::Precision::FP32 && precision != InferenceEngine::Precision::BF16)
         precision = InferenceEngine::Precision::FP32;
     auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
     auto * lrnLayer = dynamic_cast<NormLayer*>(getCnnLayer().get());
index 5ab8352..870a8f6 100644 (file)
@@ -41,7 +41,8 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
     inputPrecision = getCnnLayer()->insData[0].lock()->getPrecision();
     outputPrecision = getCnnLayer()->outData[0]->getPrecision();
     // Dirty WA to support stat based quantization approach
-    if (this->getCnnLayer()->precision != Precision::I8) {
+    if (this->getCnnLayer()->precision != Precision::I8
+        && inputPrecision != Precision::BF16) {
         if (type == PoolingLayer::MAX) {
             // MKLDNN supports only equal precisions for input and output
             outputPrecision = inputPrecision;
@@ -49,6 +50,9 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
             outputPrecision = Precision::FP32;
         }
     }
+    if (inputPrecision == Precision::BF16) {
+        outputPrecision = inputPrecision;
+    }
 
     if (!fusedWith.empty()) {
         auto lastFusedLayer = fusedWith[fusedWith.size() - 1].get()->getCnnLayer();
@@ -84,6 +88,10 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
         MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
         MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
         createDescriptor({ in_candidate }, { out_candidate });
+    } else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && (inputDataType == memory::bf16 || outputDataType == memory::bf16)) {
+        MKLDNNMemoryDesc in_candidate{ parentDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
+        MKLDNNMemoryDesc out_candidate{ childDims, memory::bf16, parentDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c};
+        createDescriptor({ in_candidate }, { out_candidate });
     } else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) {
         inputDataType = memory::f32;
         outputDataType = memory::f32;
@@ -92,8 +100,10 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
         MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
         createDescriptor({ in_candidate }, { out_candidate });
     } else {
-        inputDataType = memory::f32;
-        outputDataType = memory::f32;
+        if (inputDataType != memory::bf16) {
+            inputDataType = memory::f32;
+            outputDataType = memory::f32;
+        }
         // It doesn't support any format
         for (auto format : getAvailableFormatsForDims(parentDims)) {
             MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, format};
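
The BF16 branch above requests the channel-blocked nChw16c/nCdhw16c formats. As a reference, element addressing for nChw16c can be sketched as follows (assuming the channel dimension is padded up to a multiple of 16, as mkl-dnn blocked layouts do):

    #include <cstddef>

    // Reference sketch of nChw16c addressing: dims are laid out as n, C/16, h, w, 16,
    // with the innermost block iterating over 16 consecutive channels.
    static size_t offset_nChw16c(size_t n, size_t c, size_t h, size_t w,
                                 size_t C, size_t H, size_t W) {
        const size_t block = 16;
        const size_t Cblocks = (C + block - 1) / block;
        return ((((n * Cblocks + c / block) * H + h) * W + w) * block) + c % block;
    }
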
index e49c4fc..5ef5eef 100644 (file)
@@ -13,6 +13,7 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "bf16transformer.h"
 
 using namespace mkldnn::impl::cpu;
 using namespace mkldnn::impl::utils;
@@ -333,9 +334,22 @@ public:
                 THROW_IE_EXCEPTION << "Normalize supports from 2D to 4D blobs!";
             }
 
-            weights = std::dynamic_pointer_cast<TBlob<float>>(layer->blobs.at("weights"));
-            if (!weights)
-                THROW_IE_EXCEPTION << layer->name << " weights is empty!";
+            MemoryBlob::Ptr tweights = as<MemoryBlob>(layer->blobs.at("weights"));
+            if (!tweights) {
+                THROW_IE_EXCEPTION << "Weights are not initialized or cannot be cast to MemoryBlob for Normalize layer with name '"
+                    << layer->name << "'";
+            }
+
+            if (tweights->getTensorDesc().getPrecision() == Precision::FP32) {
+                weights = tweights;
+            } else if (tweights->getTensorDesc().getPrecision() == Precision::BF16) {
+                MKLDNNPlugin::BF16Transformer transformer;
+                weights = transformer.convertBF16ToFloat(tweights);
+            } else {
+                // Unknown, unsupported data type - report an error
+                THROW_IE_EXCEPTION << "Weights for Normalize layer with name '" << layer->name <<
+                    "' have unsupported data type " << tweights->getTensorDesc().getPrecision();
+            }
             across_spatial = layer->GetParamAsBool("across_spatial", false);
             channel_shared = layer->GetParamAsBool("channel_shared", false);
             eps = layer->GetParamAsFloat("eps");
@@ -514,7 +528,7 @@ private:
     std::shared_ptr<jit_uni_normalize_across_spatial_kernel> normalize_across_spatial_kernel;
     std::shared_ptr<jit_uni_sqr_sum_kernel> sqr_sum_kernel;
 
-    TBlob<float>::Ptr weights;
+    MemoryBlob::Ptr weights;
     bool across_spatial = true;
     bool channel_shared = true;
     float eps = 1e-10f;
index 9a9a4e5..04aa8be 100644 (file)
@@ -31,21 +31,26 @@ public:
             if (layer->outData.size() != 1 && layer->outData.size() != 2)
                 THROW_IE_EXCEPTION << layer->name << " Incorrect number of output edges!";
 
-            if (layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 ||
+            // DataConfigurator::addConfig will automatically change the BF16 data type to FP32.
+            // It can be changed back explicitly, e.g. confs.back().outConfs[i].desc.setPrecision(Precision::BF16),
+            // if the current layer supports BF16 natively. Usually it does not, so nothing special is required.
+            if ((layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 &&
+                layer->insData[TOPK_DATA].lock()->getTensorDesc().getPrecision() != Precision::BF16) ||
                 layer->insData[TOPK_K].lock()->getTensorDesc().getPrecision() != Precision::I32)
-                THROW_IE_EXCEPTION << layer->name << " Incorrect input data/index values precision.";
+                THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input data/index values precision.";
 
             if (layer->insData[TOPK_K].lock()->getTensorDesc().getDims().size() > 1)
-                THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension";
+                THROW_IE_EXCEPTION << layer->name << " TopKImpl - Index vector should be 1 dimension";
 
             SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();
             SizeVector src_data_dims = layer->insData[TOPK_DATA].lock()->getTensorDesc().getDims();
             if (src_data_dims.size() != dst_dims.size())
-                THROW_IE_EXCEPTION << layer->name << " Incorrect input/output tensor dimension sizes";
+                THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect input/output tensor dimension sizes";
 
             if (layer->outData.size() == 2) {
-                if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32)
-                    THROW_IE_EXCEPTION << layer->name << " Incorrect output data tensor precision. Only FP32 is supported!";
+                if (layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::FP32 &&
+                    layer->outData[TOPK_VALUE]->getTensorDesc().getPrecision() != Precision::BF16)
+                    THROW_IE_EXCEPTION << layer->name << " TopKImpl - Incorrect output data tensor precision. Only floating point data types are supported!";
 
                 SizeVector dst_idx_dims = layer->outData[TOPK_INDEX]->getTensorDesc().getDims();
                 if (dst_dims.size() != dst_idx_dims.size())
index c6b360f..786f1d7 100644 (file)
@@ -122,11 +122,11 @@ static Blob::Ptr prepare_plain_data(Blob::Ptr blob) {
             break;
         }
         case Precision::I16:
-        case Precision::U16: {
-            auto *pln_blob_ptr = pln_blob->buffer().as<int16_t*>();
+        case Precision::U16:
+        case Precision::BF16: {
+            auto *pln_blob_ptr = pln_blob->buffer().as<int16_t *>();
             auto *blob_ptr = blob->buffer().as<int16_t *>();
-            for (size_t i = 0; i < data_size; i++)
-                pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
+            for (size_t i = 0; i < data_size; i++) pln_blob_ptr[i] = blob_ptr[blob_wrp.off_l(i)];
             break;
         }
         case Precision::I8:
@@ -187,7 +187,8 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) {
            << dims.size() << "D "
            << "shape: ";
     for (size_t d : dims) stream << d << " ";
-    stream << "(" << _blob->size() << ")" <<std::endl;
+    stream << "(" << _blob->size() << ")" <<
+    " by address 0x" << std::hex << _blob->buffer().as<long long>() << std::dec <<std::endl;
 
     // Dump data
     MKLDNNMemoryDesc mdesc(_blob->getTensorDesc());
@@ -202,6 +203,17 @@ void BlobDumper::dumpAsTxt(std::ostream &stream) {
                 stream << blob_ptr[blob_wrp.off_l(i)] << std::endl;
             break;
         }
+        case Precision::BF16:
+        {
+            auto *blob_ptr = _blob->buffer().as<int16_t *>();
+            for (size_t i = 0; i < data_size; i++) {
+                int i16n = blob_ptr[blob_wrp.off_l(i)];
+                i16n = i16n << 16;
+                float fn = *(reinterpret_cast<float *>(&i16n));
+                stream << fn << std::endl;
+            }
+            break;
+        }
         case Precision::I32: {
             auto *blob_ptr = _blob->buffer().as<int32_t*>();
             for (size_t i = 0; i < data_size; i++)
index b184940..bcddeab 100644 (file)
@@ -117,6 +117,7 @@ InferenceEngine::Blob::Ptr make_blob_with_precision(InferenceEngine::Precision p
         USE_FACTORY(I64);
         USE_FACTORY(U64);
         USE_FACTORY(BIN);
+        USE_FACTORY(BF16);
         USE_FACTORY(BOOL);
     default:
         THROW_IE_EXCEPTION << "cannot locate blob for precision: " << precision;
index c53fbc6..db3ea04 100644 (file)
@@ -40,6 +40,21 @@ namespace InferenceEngine {
  * @snippet example_async_infer_request.cpp async_infer_request:define_pipeline
  */
 class AsyncInferRequestThreadSafeDefault : public AsyncInferRequestThreadSafeInternal {
+    using AtomicCallback = std::atomic<IInferRequest::CompletionCallback>;
+    using Futures = std::vector<std::shared_future<void>>;
+    using Promise = std::shared_ptr<std::promise<void>>;
+    enum Stage_e : std::uint8_t { executor, task };
+    struct DisableCallbackGuard{
+        explicit DisableCallbackGuard(AtomicCallback& callback)
+            : _callbackRef(callback), _callback(callback.exchange(nullptr)) {}
+        ~DisableCallbackGuard() {
+            _callbackRef = _callback;
+        }
+        AtomicCallback& _callbackRef;
+        IInferRequest::CompletionCallback _callback;
+    };
+    InferRequestInternal::Ptr _syncRequest;
+
 public:
     /**
      * @brief A shared pointer to AsyncInferRequestThreadSafeDefault
@@ -47,7 +62,7 @@ public:
     using Ptr = std::shared_ptr<AsyncInferRequestThreadSafeDefault>;
 
     /**
-     * @brief      Wraps a InferRequestInternal::Ptr implementation and constructs a 
+     * @brief      Wraps a InferRequestInternal::Ptr implementation and constructs a
      * AsyncInferRequestThreadSafeDefault::_pipeline where `taskExecutor` is used to run InferRequestInternal::Infer
      * asynchronously.
      *
@@ -58,12 +73,11 @@ public:
     AsyncInferRequestThreadSafeDefault(const InferRequestInternal::Ptr& request,
                                        const ITaskExecutor::Ptr& taskExecutor,
                                        const ITaskExecutor::Ptr& callbackExecutor)
-        : _requestExecutor {taskExecutor},
+        : _syncRequest {request},
+          _requestExecutor {taskExecutor},
           _callbackExecutor {callbackExecutor},
-          _syncRequest {request} {
-        _pipeline = {
-            { _requestExecutor, [this] { _syncRequest->Infer(); } }
-        };
+          _pipeline {{taskExecutor, [this] {_syncRequest->Infer();}}},
+          _syncPipeline{{std::make_shared<ImmediateExecutor>(), [this] {_syncRequest->Infer();}}} {
     }
 
     /**
@@ -140,9 +154,12 @@ protected:
      * @brief Creates and run the first stage task. If destructor was not called add a new std::future to the
      * AsyncInferRequestThreadSafeDefault::_futures list that would be used to wait
      * AsyncInferRequestThreadSafeDefault::_pipeline finish
+     * @param[in]  itBeginStage Iterator to the first stage of the pipeline
+     * @param[in]  itEndStage End iterator of the pipeline
+     * @param[in]  callbackExecutor Executor for the final or error stage
      */
-    void RunFirstStage() {
-        _itStage = _pipeline.begin();
+    void RunFirstStage(const Pipeline::iterator itBeginStage, const Pipeline::iterator itEndStage,
+                       const ITaskExecutor::Ptr callbackExecutor = {}) {
         _promise = {};
         bool stop = [&] {
             std::lock_guard<std::mutex> lock(_mutex);
@@ -165,9 +182,9 @@ protected:
 
         if (!stop) {
             try {
-                auto& firstStageExecutor = std::get<Stage_e::executor>(*_itStage);
+                auto& firstStageExecutor = std::get<Stage_e::executor>(*itBeginStage);
                 IE_ASSERT(nullptr != firstStageExecutor);
-                firstStageExecutor->run(MakeNextStageTask());
+                firstStageExecutor->run(MakeNextStageTask(itBeginStage, itEndStage, std::move(callbackExecutor)));
             } catch (...) {
                 _promise.set_exception(std::current_exception());
                 throw;
@@ -199,31 +216,34 @@ protected:
      * @brief Implements Infer() using StartAsync() and Wait()
      */
     void InferUsingAsync() {
-        struct CallbackStorage {
-            explicit CallbackStorage(AtomicCallback& callback)
-                : _callbackRef(callback), _callback(callback.exchange(nullptr)) {}
-            ~CallbackStorage() {
-                _callbackRef = _callback;
-            }
-            AtomicCallback& _callbackRef;
-            IInferRequest::CompletionCallback _callback;
-        } storage {_callback};
+        DisableCallbackGuard disableCallbackGuard{_callback};
         StartAsync_ThreadUnsafe();
         Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
     }
 
-    ITaskExecutor::Ptr _requestExecutor;  //!< Used to run inference CPU tasks
-    ITaskExecutor::Ptr _callbackExecutor;  //!< Used to run post inference callback
+    /**
+     * @brief Implements Infer() using synchronous pipeline and Wait()
+     */
+    void InferUsingSync() {
+        DisableCallbackGuard disableCallbackGuard{_callback};
+        _syncRequest->checkBlobs();
+        RunFirstStage(_syncPipeline.begin(), _syncPipeline.end(), _syncCallbackExecutor);
+        Wait(InferenceEngine::IInferRequest::WaitMode::RESULT_READY);
+    }
+
+    ITaskExecutor::Ptr _requestExecutor;  //!< Used to run inference CPU tasks.
+    ITaskExecutor::Ptr _callbackExecutor;  //!< Used to run post inference callback in asynchronous pipeline
+    ITaskExecutor::Ptr _syncCallbackExecutor;  //!< Used to run post inference callback in synchronous pipeline
     Pipeline _pipeline;  //!< Pipeline variable that should be filled by inherited class.
+    Pipeline _syncPipeline;  //!< Synchronous pipeline variable that should be filled by inherited class.
 
     void StartAsync_ThreadUnsafe() override {
         _syncRequest->checkBlobs();
-        RunFirstStage();
+        RunFirstStage(_pipeline.begin(), _pipeline.end(), _callbackExecutor);
     }
 
     void Infer_ThreadUnsafe() override {
-        _syncRequest->checkBlobs();
-        _syncRequest->InferImpl();
+        InferUsingSync();
     }
 
     void GetPerformanceCounts_ThreadUnsafe(std::map<std::string, InferenceEngineProfileInfo>& perfMap) const override {
@@ -264,38 +284,35 @@ protected:
     }
 
 private:
-    using AtomicCallback = std::atomic<IInferRequest::CompletionCallback>;
-    using Futures = std::vector<std::shared_future<void>>;
-    using Promise = std::shared_ptr<std::promise<void>>;
-    enum Stage_e : std::uint8_t { executor, task };
-
     /**
      * @brief Create a task with next pipeline stage.
-     *        Each call to MakeNextStageTask() generates `InferenceEngine::Task` objects for each stage.
-     *        When stage task is called it increments
-     *        `_stage` counter, call `_pipeline` task for this stage and generates next stage task using
-     * MakeNextStageTask() and pass it to executor. On last stage or if the exception is raised from `_pipeline` task
+     * Each call to MakeNextStageTask() generates @ref Task objects for each stage.
+     * On the last stage, or if an exception is raised from a `_pipeline` task,
      * the last stage task is called or passed to callback executor if it is presented. The last stage task call the
      * callback, if it is presented, capture the `_promise` member and use it to forward completion or exception to the
      * one of `_futures` member
+     * @param[in]  itStage Iterator to the next stage of the pipeline
+     * @param[in]  itEndStage End iterator of the pipeline
+     * @param[in]  callbackExecutor Executor that will run final stage with callback call
      * @return A next stage task
      */
-    Task MakeNextStageTask() {
-        return [this]() mutable {
+    Task MakeNextStageTask(const Pipeline::iterator itStage, const Pipeline::iterator itEndStage,
+                           const ITaskExecutor::Ptr callbackExecutor) {
+        return std::bind([this, itStage, itEndStage](ITaskExecutor::Ptr& callbackExecutor) mutable {
             StatusCode requestStatus = StatusCode::OK;
             std::exception_ptr localCurrentException = nullptr;
-            auto& thisStage = *_itStage;
-            auto copyItStage = ++_itStage;
+            auto& thisStage = *itStage;
+            auto itNextStage = itStage + 1;
 
             try {
                 auto& stageTask = std::get<Stage_e::task>(thisStage);
                 IE_ASSERT(nullptr != stageTask);
                 stageTask();
-                if (_pipeline.end() != _itStage) {
-                    auto nextStage = *_itStage;
+                if (itEndStage != itNextStage) {
+                    auto& nextStage = *itNextStage;
                     auto& nextStageExecutor = std::get<Stage_e::executor>(nextStage);
                     IE_ASSERT(nullptr != nextStageExecutor);
-                    nextStageExecutor->run(MakeNextStageTask());
+                    nextStageExecutor->run(MakeNextStageTask(itNextStage, itEndStage, std::move(callbackExecutor)));
                 }
             } catch (InferenceEngine::details::InferenceEngineException& ie_ex) {
                 requestStatus = ie_ex.hasStatus() ? ie_ex.getStatus() : StatusCode::GENERAL_ERROR;
@@ -305,7 +322,7 @@ private:
                 localCurrentException = std::current_exception();
             }
 
-            if ((_pipeline.end() == copyItStage) || (nullptr != localCurrentException)) {
+            if ((itEndStage == itNextStage) || (nullptr != localCurrentException)) {
                 auto lastStageTask = [this, requestStatus, localCurrentException]() mutable {
                     auto promise = std::move(_promise);
                     auto callback = _callback.load();
@@ -327,20 +344,18 @@ private:
                     }
                 };
 
-                if (nullptr == _callbackExecutor) {
+                if (nullptr == callbackExecutor) {
                     lastStageTask();
                 } else {
-                    _callbackExecutor->run(std::move(lastStageTask));
+                    callbackExecutor->run(std::move(lastStageTask));
                 }
             }
-        };
+        }, std::move(callbackExecutor));
     }
 
-    InferRequestInternal::Ptr _syncRequest;
     void* _userData = nullptr;
     AtomicCallback _callback = {nullptr};
     IInferRequest::Ptr _publicInterface;
-    Pipeline::iterator _itStage;
     std::promise<void> _promise;
     mutable std::mutex _mutex;
     Futures _futures;
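
A plugin-side subclass is expected to fill _pipeline (and, after this change, optionally _syncPipeline) with its own stages. A hedged sketch with an illustrative two-stage pipeline; the class name, the wait stage and the include path are assumptions, not the actual MKLDNN implementation:

    #include <cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp>  // path assumed

    // Illustrative subclass: a two-stage asynchronous pipeline (submit, then wait),
    // for a hypothetical device that separates kernel submission from result readiness.
    class MyAsyncInferRequest : public InferenceEngine::AsyncInferRequestThreadSafeDefault {
    public:
        MyAsyncInferRequest(const InferenceEngine::InferRequestInternal::Ptr& request,
                            const InferenceEngine::ITaskExecutor::Ptr& taskExecutor,
                            const InferenceEngine::ITaskExecutor::Ptr& waitExecutor,
                            const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor)
            : AsyncInferRequestThreadSafeDefault(request, taskExecutor, callbackExecutor) {
            // Replace the default single-stage pipeline with two stages.
            _pipeline = {
                { taskExecutor, [request] { request->InferImpl(); } },                    // stage 1: submit
                { waitExecutor, [] { /* hypothetical: wait for device completion */ } }   // stage 2: sync
            };
        }
    };
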
index aa42a5d..41427d1 100644 (file)
@@ -72,4 +72,11 @@ INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512f();
  */
 INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_avx512_core();
 
+/**
+ * @brief      Checks whether the CPU supports the BFloat16 capability
+ * @ingroup    ie_dev_api_system_conf
+ * @return     `true` if AVX512_BF16 instructions are available, `false` otherwise
+ */
+INFERENCE_ENGINE_API_CPP(bool) with_cpu_x86_bfloat16();
+
 }  // namespace InferenceEngine
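
A hedged usage sketch of the new capability check, mirroring the dispatch done by the MKLDNN plugin earlier in this change:

    #include <ie_system_conf.h>
    #include <iostream>

    int main() {
        // Pick the numeric path at runtime depending on hardware support.
        if (InferenceEngine::with_cpu_x86_bfloat16()) {
            std::cout << "AVX512_BF16 available: BF16 kernels can run natively" << std::endl;
        } else {
            std::cout << "No AVX512_BF16: BF16 networks are converted back to FP32" << std::endl;
        }
        return 0;
    }
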
index b2fb1a4..6f1525b 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5fa4f27..f6b8898 100644 (file)
@@ -13,6 +13,7 @@
 #include <unordered_map>
 #include <vector>
 #include <utility>
+#include <mutex>
 
 #include "threading/ie_itask_executor.hpp"
 #include "threading/ie_istreams_executor.hpp"
@@ -40,6 +41,8 @@ public:
 private:
     std::unordered_map<std::string, ITaskExecutor::Ptr> executors;
     std::vector<std::pair<IStreamsExecutor::Config, IStreamsExecutor::Ptr> > cpuStreamsExecutors;
+    std::mutex streamExecutorMutex;
+    std::mutex taskExecutorMutex;
 };
 
 /**
@@ -59,13 +62,7 @@ public:
      * @brief      Returns a global instance of ExecutorManager
      * @return     The instance.
      */
-    static ExecutorManager* getInstance() {
-        if (!_instance) {
-            _instance = new ExecutorManager();
-        }
-
-        return _instance;
-    }
+    static ExecutorManager* getInstance();
 
     /**
      * @brief A deleted copy constructor
@@ -92,9 +89,6 @@ public:
      */
     size_t getExecutorsNumber();
 
-    /**
-     * @cond
-     */
     size_t getIdleCPUStreamsExecutorsNumber();
 
     void clear(const std::string& id = {});
@@ -106,7 +100,9 @@ private:
     ExecutorManager() {}
 
     ExecutorManagerImpl _impl;
-    static ExecutorManager* _instance;
+
+    static std::mutex _mutex;
+    static ExecutorManager *_instance;
 };
 
 }  // namespace InferenceEngine
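
The accessor definition now lives out of line together with the new static mutex. A hedged sketch of a matching definition (inside namespace InferenceEngine; the actual ie_executor_manager.cpp may differ):

    #include "threading/ie_executor_manager.hpp"  // header path assumed

    // Possible out-of-line definitions for the now thread-safe singleton accessor.
    std::mutex ExecutorManager::_mutex;
    ExecutorManager* ExecutorManager::_instance = nullptr;

    ExecutorManager* ExecutorManager::getInstance() {
        std::lock_guard<std::mutex> lock(_mutex);  // serialize first-time construction
        if (_instance == nullptr) {
            _instance = new ExecutorManager();
        }
        return _instance;
    }
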
index 628ea6a..d53c6e4 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 8ad5e97..a2163f9 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 3acee43..2a5da46 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index c3ad1e5..5943b29 100644 (file)
@@ -189,6 +189,6 @@ ie_developer_export_targets(${TARGET_NAME})
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
index db46c1b..cf121f4 100644 (file)
@@ -851,139 +851,17 @@ void calcRowLinear_32F(float *dst[],
 }
 
 //------------------------------------------------------------------------------
-namespace calcRowArea {
-// vertical pass
-template<typename T, typename A, typename I, typename W>
-static inline void downy(const T *src[], int inWidth, const MapperUnit<A, I>& ymap, A yalpha,
-                         W vbuf[]) {
-    int y_1st = ymap.index0;
-    int ylast = ymap.index1 - 1;
-
-    // yratio > 1, so at least 2 rows
-    GAPI_DbgAssert(y_1st < ylast);
-
-    // 1st and last rows
-    {
-        int w = 0;
-
-    #if CV_SIMD128
-        if (std::is_same<T, uint8_t>::value) {
-            for (; w <= inWidth - 8; w += 8) {
-                v_uint16x8 vsrc0 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[0][w]));
-                v_uint16x8 vsrc1 = v_load_expand(reinterpret_cast<const uint8_t*>(& src[ylast - y_1st][w]));
-                v_uint16x8 vres = v_mulhi(vsrc0 << 8, static_cast<Q0_16>(ymap.alpha0)) +
-                                  v_mulhi(vsrc1 << 8, static_cast<Q0_16>(ymap.alpha1));
-                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
-            }
-        }
-    #endif
-
-        for (; w < inWidth; w++) {
-            vbuf[w] = mulas(ymap.alpha0, src[0][w])
-                    + mulas(ymap.alpha1, src[ylast - y_1st][w]);
-        }
-    }
-
-    // inner rows (if any)
-    for (int i = 1; i < ylast - y_1st; i++) {
-        int w = 0;
-
-    #if CV_SIMD128
-        if (std::is_same<T, uint8_t>::value) {
-            for (; w <= inWidth - 8; w += 8) {
-                v_uint16x8 vsrc = v_load_expand(reinterpret_cast<const uint8_t*>(& src[i][w]));
-                v_uint16x8 vres = v_load(reinterpret_cast<Q8_8*>(& vbuf[w]));
-                vres = vres + v_mulhi(vsrc << 8, static_cast<Q0_16>(yalpha));
-                v_store(reinterpret_cast<Q8_8*>(& vbuf[w]), vres);
-            }
-        }
-    #endif
-
-        for (; w < inWidth; w++) {
-            vbuf[w] += mulas(yalpha, src[i][w]);
-        }
-    }
-}
-
-// horizontal pass
-template<typename T, typename A, typename I, typename W>
-static inline void downx(T dst[], int outWidth, int xmaxdf, const I xindex[], const A xalpha[],
-                         const W vbuf[]) {
-#define HSUM(xmaxdf) \
-    for (int x = 0; x < outWidth; x++) { \
-        int      index =  xindex[x]; \
-        const A *alpha = &xalpha[x * xmaxdf]; \
-\
-        W sum = 0; \
-        for (int i = 0; i < xmaxdf; i++) { \
-            sum += mulaw(alpha[i], vbuf[index + i]); \
-        } \
-\
-        dst[x] = convert_cast<T>(sum); \
-    }
-
-    if (2 == xmaxdf) {
-        HSUM(2);
-    } else if (3 == xmaxdf) {
-        HSUM(3);
-    } else if (4 == xmaxdf) {
-        HSUM(4);
-    } else if (5 == xmaxdf) {
-        HSUM(5);
-    } else if (6 == xmaxdf) {
-        HSUM(6);
-    } else if (7 == xmaxdf) {
-        HSUM(7);
-    } else if (8 == xmaxdf) {
-        HSUM(8);
-    } else {
-        HSUM(xmaxdf);
-    }
-#undef HSUM
-}
-}  // namespace calcRowArea
-
-template<typename T, typename A, typename I, typename W>
-static void calcRowArea_impl_sse4(T dst[], const T *src[], const Size& inSz, const Size& outSz,
-    A yalpha, const MapperUnit<A, I>& ymap, int xmaxdf, const I xindex[], const A xalpha[],
-    W vbuf[]) {
-    bool xRatioEq1 = inSz.width  == outSz.width;
-    bool yRatioEq1 = inSz.height == outSz.height;
-
-    if (!yRatioEq1 && !xRatioEq1) {
-        calcRowArea::downy(src, inSz.width, ymap, yalpha, vbuf);
-        calcRowArea::downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
-
-    } else if (!yRatioEq1) {
-        GAPI_DbgAssert(xRatioEq1);
-        calcRowArea::downy(src, inSz.width, ymap, yalpha, vbuf);
-        for (int x = 0; x < outSz.width; x++) {
-            dst[x] = convert_cast<T>(vbuf[x]);
-        }
-
-    } else if (!xRatioEq1) {
-        GAPI_DbgAssert(yRatioEq1);
-        for (int w = 0; w < inSz.width; w++) {
-            vbuf[w] = convert_cast<W>(src[0][w]);
-        }
-        calcRowArea::downx(dst, outSz.width, xmaxdf, xindex, xalpha, vbuf);
-
-    } else {
-        GAPI_DbgAssert(xRatioEq1 && yRatioEq1);
-        memcpy(dst, src[0], outSz.width * sizeof(T));
-    }
-}
 
 void calcRowArea_8U(uchar dst[], const uchar *src[], const Size& inSz, const Size& outSz,
     Q0_16 yalpha, const MapperUnit8U &ymap, int xmaxdf, const short xindex[], const Q0_16 xalpha[],
     Q8_8 vbuf[]) {
-    calcRowArea_impl_sse4(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
 }
 
 void calcRowArea_32F(float dst[], const float *src[], const Size& inSz, const Size& outSz,
     float yalpha, const MapperUnit32F& ymap, int xmaxdf, const int xindex[], const float xalpha[],
     float vbuf[]) {
-    calcRowArea_impl_sse4(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
+    calcRowArea_impl(dst, src, inSz, outSz, yalpha, ymap, xmaxdf, xindex, xalpha, vbuf);
 }
 
 //------------------------------------------------------------------------------
index a5513e0..667e9d6 100644 (file)
@@ -1244,7 +1244,67 @@ static void calcAreaRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer
 
         auto dst = out.OutLine<T>(l);
 
-    #ifdef HAVE_SSE
+        #ifdef HAVE_AVX512
+        if (with_cpu_x86_avx512f()) {
+            if (std::is_same<T, uchar>::value) {
+                avx512::calcRowArea_8U(reinterpret_cast<uchar*>(dst),
+                                       reinterpret_cast<const uchar**>(src),
+                                       inSz, outSz,
+                                       static_cast<Q0_16>(ymapper.alpha),
+                                       reinterpret_cast<const MapperUnit8U&>(ymap),
+                                       xmaxdf[0],
+                                       reinterpret_cast<const short*>(xindex),
+                                       reinterpret_cast<const Q0_16*>(xalpha),
+                                       reinterpret_cast<Q8_8*>(vbuf));
+                continue;  // next l = 0, ..., lpi-1
+            }
+
+            if (std::is_same<T, float>::value) {
+                avx512::calcRowArea_32F(reinterpret_cast<float*>(dst),
+                                        reinterpret_cast<const float**>(src),
+                                        inSz, outSz,
+                                        static_cast<float>(ymapper.alpha),
+                                        reinterpret_cast<const MapperUnit32F&>(ymap),
+                                        xmaxdf[0],
+                                        reinterpret_cast<const int*>(xindex),
+                                        reinterpret_cast<const float*>(xalpha),
+                                        reinterpret_cast<float*>(vbuf));
+                continue;
+            }
+        }
+        #endif  // HAVE_AVX512
+
+        #ifdef HAVE_AVX2
+        if (with_cpu_x86_avx2()) {
+            if (std::is_same<T, uchar>::value) {
+                avx::calcRowArea_8U(reinterpret_cast<uchar*>(dst),
+                                    reinterpret_cast<const uchar**>(src),
+                                    inSz, outSz,
+                                    static_cast<Q0_16>(ymapper.alpha),
+                                    reinterpret_cast<const MapperUnit8U&>(ymap),
+                                    xmaxdf[0],
+                                    reinterpret_cast<const short*>(xindex),
+                                    reinterpret_cast<const Q0_16*>(xalpha),
+                                    reinterpret_cast<Q8_8*>(vbuf));
+                continue;  // next l = 0, ..., lpi-1
+            }
+
+            if (std::is_same<T, float>::value) {
+                avx::calcRowArea_32F(reinterpret_cast<float*>(dst),
+                                     reinterpret_cast<const float**>(src),
+                                     inSz, outSz,
+                                     static_cast<float>(ymapper.alpha),
+                                     reinterpret_cast<const MapperUnit32F&>(ymap),
+                                     xmaxdf[0],
+                                     reinterpret_cast<const int*>(xindex),
+                                     reinterpret_cast<const float*>(xalpha),
+                                     reinterpret_cast<float*>(vbuf));
+                continue;
+            }
+        }
+        #endif  // HAVE_AVX2
+
+        #ifdef HAVE_SSE
         if (with_cpu_x86_sse42()) {
             if (std::is_same<T, uchar>::value) {
                 calcRowArea_8U(reinterpret_cast<uchar*>(dst),
@@ -1272,7 +1332,7 @@ static void calcAreaRow(const cv::gapi::fluid::View& in, cv::gapi::fluid::Buffer
                 continue;
             }
         }
-    #endif  // HAVE_SSE
+        #endif  // HAVE_SSE
 
         // vertical pass
         int y_1st = ymap.index0;
index fd2509e..a5c806b 100644 (file)
@@ -1,7 +1,3 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
 #ifndef IE_PREPROCESS_GAPI_KERNELS_SIMD_IMPL_H
 #define IE_PREPROCESS_GAPI_KERNELS_SIMD_IMPL_H
 
index d9b54a7..f7f5f85 100644 (file)
@@ -33,7 +33,6 @@ target_include_directories(${TARGET_NAME} PUBLIC ${PUBLIC_HEADERS_DIR})
 target_include_directories(${TARGET_NAME} PUBLIC ${IE_MAIN_SOURCE_DIR}/include)
 
 add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
-add_clang_format_target(${TARGET_NAME}_clang_format FOR_TARGETS ${TARGET_NAME})
 
 # developer package
 
@@ -42,6 +41,6 @@ ie_developer_export_targets(${TARGET_NAME})
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
-        ARCHIVE DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH} COMPONENT core
+        ARCHIVE DESTINATION ${IE_CPACK_ARCHIVE_PATH} COMPONENT core
         LIBRARY DESTINATION ${IE_CPACK_LIBRARY_PATH} COMPONENT core)
index 3b0e095..f03c05c 100644 (file)
@@ -39,7 +39,7 @@ void ngraph::pass::PullTransposeThroughFQUp::pull_transpose_through_fq() {
 
         auto input_shape = fq->input(0).get_source_output().get_shape();
 
-        std::vector<std::shared_ptr<ngraph::Node> > fq_inputs;
+        ngraph::OutputVector fq_inputs;
         for (size_t i = 0; i < fq->inputs().size(); ++i) {
             std::shared_ptr<ngraph::Node> fq_input;
             fq_input = fq->input(i).get_source_output().get_node_shared_ptr();
@@ -52,11 +52,11 @@ void ngraph::pass::PullTransposeThroughFQUp::pull_transpose_through_fq() {
                 fq_input = std::make_shared<ngraph::opset1::Unsqueeze>(fq_input,
                                                                        opset1::Constant::create(element::i64, Shape{unsqueeze_axes.size()}, unsqueeze_axes));
             }
-            fq_input = transpose->copy_with_new_args({fq_input, const_order});
+            fq_input = transpose->copy_with_new_inputs({fq_input, const_order});
             fq_inputs.push_back(fq_input);
         }
 
-        auto new_fq = fq->copy_with_new_args(fq_inputs);
+        auto new_fq = fq->copy_with_new_inputs(fq_inputs);
         new_fq->set_friendly_name(fq->get_friendly_name());
         ngraph::replace_node(transpose, new_fq);
 
@@ -65,4 +65,4 @@ void ngraph::pass::PullTransposeThroughFQUp::pull_transpose_through_fq() {
 
     auto m = std::make_shared<ngraph::pattern::Matcher>(transpose, "PullTransposeThroughFQUp");
     this->add_matcher(m, callback, PassProperty::CHANGE_DYNAMIC_STATE);
-}
\ No newline at end of file
+}
index 161e9f3..6c9d04f 100644 (file)
@@ -23,6 +23,8 @@ bool get_single_value(const std::shared_ptr<op::Constant>& const_node, float& va
         return util::normalize_single_value(const_node->get_vector<float16>(), value);
     case element::Type_t::f32:
         return util::normalize_single_value(const_node->get_vector<float>(), value);
+    case element::Type_t::bf16:
+        return util::normalize_single_value(const_node->get_vector<bfloat16>(), value);
     case element::Type_t::f64:
         return util::normalize_single_value(const_node->get_vector<double>(), value);
     case element::Type_t::i8:
index 6521529..43e9308 100644 (file)
@@ -47,6 +47,8 @@ function(add_common_target TARGET_NAME STATIC_IE)
 
         ie_developer_export_targets(${TARGET_NAME})
     endif()
+
+    target_link_libraries(${TARGET_NAME} PUBLIC ${NGRAPH_LIBRARIES} inference_engine_transformations)
 endfunction()
 
 add_common_target("vpu_common_lib" FALSE)
diff --git a/inference-engine/src/vpu/common/include/vpu/ngraph/operations/dynamic_shape_resolver.hpp b/inference-engine/src/vpu/common/include/vpu/ngraph/operations/dynamic_shape_resolver.hpp
new file mode 100644 (file)
index 0000000..50c8dfc
--- /dev/null
@@ -0,0 +1,28 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "ngraph/op/op.hpp"
+
+#include <memory>
+
+namespace ngraph { namespace op {
+
+class DynamicShapeResolver : public Op {
+public:
+    static constexpr NodeTypeInfo type_info{"DynamicShapeResolver", 1};
+    const NodeTypeInfo& get_type_info() const override { return type_info; }
+
+    DynamicShapeResolver(const Output<Node>& tensorWithData, const Output<Node>& tensorWithDims);
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override;
+
+    bool visit_attributes(ngraph::AttributeVisitor& visitor) override;
+};
+
+}  // namespace op
+}  // namespace ngraph
diff --git a/inference-engine/src/vpu/common/include/vpu/ngraph/operations/static_shape_nonzero.hpp b/inference-engine/src/vpu/common/include/vpu/ngraph/operations/static_shape_nonzero.hpp
new file mode 100644 (file)
index 0000000..8d8dcb5
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/node.hpp>
+#include <ngraph/op/op.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace ngraph {
+namespace op {
+
+class StaticShapeNonZero : public Op {
+public:
+    static constexpr NodeTypeInfo type_info{"StaticShapeNonZero", 1};
+    const NodeTypeInfo& get_type_info() const override { return type_info; }
+
+    explicit StaticShapeNonZero(const Output<ngraph::Node>& input);
+
+    void validate_and_infer_types() override;
+
+    std::shared_ptr<Node> copy_with_new_args(const NodeVector& new_args) const override;
+
+    bool visit_attributes(ngraph::AttributeVisitor& visitor) override;
+};
+}  // namespace op
+}  // namespace ngraph
diff --git a/inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape.hpp b/inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape.hpp
new file mode 100644 (file)
index 0000000..855cbcf
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+
+#include <vector>
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+class DynamicToStaticShape : public FunctionPass {
+public:
+    DynamicToStaticShape() = default;
+
+    bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
+
+private:
+    bool validateStaticShapes(std::shared_ptr<ngraph::Function> function) const;
+};
+
+}  // namespace pass
+}  // namespace ngraph
diff --git a/inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp b/inference-engine/src/vpu/common/include/vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp
new file mode 100644 (file)
index 0000000..15a7f53
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+
+#include <vector>
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+class DynamicToStaticShapeNonZero : public GraphRewrite {
+public:
+    DynamicToStaticShapeNonZero();
+};
+
+}  // namespace pass
+}  // namespace ngraph
index 8179535..ff8c921 100644 (file)
@@ -36,19 +36,19 @@ void throwFormat(const char* fileName, int lineNumber, const char* messageFormat
 }  // namespace details
 
 #define VPU_THROW_FORMAT(...)                                                         \
-    vpu::details::throwFormat<details::VPUException>(__FILE__, __LINE__, __VA_ARGS__)
+    vpu::details::throwFormat<vpu::details::VPUException>(__FILE__, __LINE__, __VA_ARGS__)
 
 #define VPU_THROW_UNLESS(condition, ...)                                                       \
     do {                                                                                       \
         if (!(condition)) {                                                                    \
-            vpu::details::throwFormat<details::VPUException>(__FILE__, __LINE__, __VA_ARGS__); \
+            vpu::details::throwFormat<vpu::details::VPUException>(__FILE__, __LINE__, __VA_ARGS__); \
         }                                                                                      \
     } while (false)
 
 #define VPU_THROW_UNSUPPORTED_UNLESS(condition, ...)                                                        \
     do {                                                                                                    \
         if (!(condition)) {                                                                                 \
-            vpu::details::throwFormat<details::UnsupportedLayerException>(__FILE__, __LINE__, __VA_ARGS__); \
+            vpu::details::throwFormat<vpu::details::UnsupportedLayerException>(__FILE__, __LINE__, __VA_ARGS__); \
         }                                                                                                   \
     } while (false)
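
These macros are now spelled with the fully qualified vpu::details:: namespace, so they keep working when expanded from code that is not itself inside the vpu namespace. A minimal usage sketch follows (the caller namespace and function are illustrative; the include path is the one used elsewhere in this commit):

    #include <vpu/utils/error.hpp>

    namespace my_plugin {

    void checkStageCount(int numStages) {
        // Expands to vpu::details::throwFormat<vpu::details::VPUException>(__FILE__, __LINE__, ...)
        // regardless of which namespace this function lives in.
        VPU_THROW_UNLESS(numStages > 0, "model contains no executable stages");
    }

    }  // namespace my_plugin
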
 
index 270966e..1564bab 100644 (file)
@@ -25,7 +25,7 @@ VPU_DECLARE_ENUM(LayoutPreference,
 InferenceEngine::Layout deviceLayout(InferenceEngine::Layout const& layout,
                                      LayoutPreference const& layoutPreference);
 
-ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in);
+ie::Blob::Ptr convertBlobFP32toFP16(const ie::Blob::CPtr& in);
 
 ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& original);
 ie::Blob::Ptr copyBlob(const ie::Blob::Ptr& in, ie::Layout outLayout, void* ptr = nullptr);
diff --git a/inference-engine/src/vpu/common/src/ngraph/operations/dynamic_shape_resolver.cpp b/inference-engine/src/vpu/common/src/ngraph/operations/dynamic_shape_resolver.cpp
new file mode 100644 (file)
index 0000000..ebe0a62
--- /dev/null
@@ -0,0 +1,43 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+namespace ngraph { namespace op {
+
+constexpr NodeTypeInfo DynamicShapeResolver::type_info;
+
+DynamicShapeResolver::DynamicShapeResolver(const Output<Node>& tensorWithData, const Output<Node>& tensorWithDims)
+    : Op(OutputVector{tensorWithData, tensorWithDims}) {
+    constructor_validate_and_infer_types();
+}
+
+std::shared_ptr<Node> DynamicShapeResolver::copy_with_new_args(const NodeVector& new_args) const {
+    check_new_args_count(this, new_args);
+    return std::make_shared<DynamicShapeResolver>(new_args.at(0), new_args.at(1));
+}
+
+void DynamicShapeResolver::validate_and_infer_types() {
+    NODE_VALIDATION_CHECK(this, get_input_size() == 2, "(", get_friendly_name(), ") supports only ", 2, " inputs, but ", get_input_size(), " provided");
+    NODE_VALIDATION_CHECK(this, get_input_partial_shape(0).is_static(), "(", get_friendly_name(), ") does not support dynamic shape for data tensor");
+    NODE_VALIDATION_CHECK(this, get_input_partial_shape(1).is_static(), "(", get_friendly_name(), ") does not support dynamic shape for dims tensor");
+
+    const auto& dimsElementType = get_input_element_type(1);
+    NODE_VALIDATION_CHECK(this, dimsElementType.is_integral_number() && dimsElementType.is_static(), "(", get_friendly_name(), ") supports only integral "
+        "number type for dims tensor, but ", dimsElementType, " provided");
+
+    const auto& dataShape = get_input_shape(0);
+    const auto& dimsShape = get_input_shape(1);
+    NODE_VALIDATION_CHECK(this, dimsShape.size() == 1 && dimsShape.front() == dataShape.size(), "(", get_friendly_name(), ") inputs shapes mismatch: first "
+        "input shape = ", dataShape, " second input shape = ", dimsShape, " but ", dataShape, " and ", Shape{dataShape.size()}, " are expected");
+
+    set_output_type(0, get_input_element_type(0), get_input_shape(0));
+}
+
+bool DynamicShapeResolver::visit_attributes(ngraph::AttributeVisitor& visitor) {
+    return true;
+}
+
+}  // namespace op
+}  // namespace ngraph
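
A minimal construction sketch for the new op (the parameter shapes and the makeResolver helper are illustrative, not part of the commit): the second input must be a 1-D integer tensor whose length equals the rank of the first input, and the op's output reuses the data tensor's element type and upper-bound shape.

    #include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"

    #include <ngraph/opsets/opset3.hpp>
    #include <memory>

    std::shared_ptr<ngraph::op::DynamicShapeResolver> makeResolver() {
        auto data = std::make_shared<ngraph::opset3::Parameter>(ngraph::element::f16, ngraph::Shape{3, 4});
        auto dims = std::make_shared<ngraph::opset3::Parameter>(ngraph::element::i64, ngraph::Shape{2});
        // Output: f16 with static shape {3, 4}, the same as the data input.
        return std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims);
    }
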
diff --git a/inference-engine/src/vpu/common/src/ngraph/operations/static_shape_nonzero.cpp b/inference-engine/src/vpu/common/src/ngraph/operations/static_shape_nonzero.cpp
new file mode 100644 (file)
index 0000000..3400e5b
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+
+namespace ngraph {
+namespace op {
+
+constexpr NodeTypeInfo StaticShapeNonZero::type_info;
+
+StaticShapeNonZero::StaticShapeNonZero(const Output<Node>& input)
+        : Op({input}) {
+    constructor_validate_and_infer_types();
+}
+
+void StaticShapeNonZero::validate_and_infer_types() {
+    NODE_VALIDATION_CHECK(this, get_input_size() == 1,
+                          "StaticShapeNonZero must have only 1 input, provided: ",
+                          get_input_size());
+
+    const auto& arg_shape = get_input_partial_shape(0);
+    NODE_VALIDATION_CHECK(this, arg_shape.is_static(),
+                          "StaticShapeNonZero doesn't support dynamic input shape");
+
+    const auto& input_et = get_input_element_type(0);
+    NODE_VALIDATION_CHECK(this,
+                          input_et.is_integral_number() || input_et.is_real(),
+                          "StaticShapeNonZero input data type needs to be a numeric type. Got: ",
+                          input_et);
+
+    const auto total_dim_size = Dimension(shape_size(arg_shape.to_shape()));
+    set_output_type(0, element::i64, {arg_shape.rank(), total_dim_size});
+    set_output_type(1, element::i64, {Dimension(2)});
+}
+
+std::shared_ptr<Node> StaticShapeNonZero::copy_with_new_args(
+        const NodeVector& new_args) const {
+    check_new_args_count(this, new_args);
+    return std::make_shared<StaticShapeNonZero>(new_args.at(0));
+}
+
+bool StaticShapeNonZero::visit_attributes(ngraph::AttributeVisitor& visitor) {
+    return true;
+}
+
+}  // namespace op
+}  // namespace ngraph
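
A minimal sketch of the resulting static-shape contract (the input shape and the helper name are illustrative): for a 2x3 input, output 0 is an i64 tensor of shape {2, 6}, i.e. rank by total element count, which bounds the number of non-zero positions, and output 1 is the small i64 tensor that carries the actually used extent of output 0.

    #include "vpu/ngraph/operations/static_shape_nonzero.hpp"

    #include <ngraph/opsets/opset3.hpp>
    #include <memory>

    std::shared_ptr<ngraph::op::StaticShapeNonZero> makeStaticNonZero() {
        auto input = std::make_shared<ngraph::opset3::Parameter>(ngraph::element::f16, ngraph::Shape{2, 3});
        // Outputs: i64 {2, 6} (indices buffer) and i64 {2} (used shape).
        return std::make_shared<ngraph::op::StaticShapeNonZero>(input);
    }
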
diff --git a/inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape.cpp b/inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape.cpp
new file mode 100644 (file)
index 0000000..e7f92cf
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape.hpp"
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+
+#include <vpu/utils/error.hpp>
+
+namespace ngraph {
+namespace pass {
+
+bool DynamicToStaticShape::run_on_function(std::shared_ptr<ngraph::Function> function) {
+    DynamicToStaticShapeNonZero().run_on_function(function);
+
+    return validateStaticShapes(function);
+}
+
+bool DynamicToStaticShape::validateStaticShapes(std::shared_ptr<ngraph::Function> function) const {
+    function->validate_nodes_and_infer_types();
+
+    for (const auto& node : function->get_ops()) {
+        for (const auto& output : node->get_outputs()) {
+            const auto outputPartialShape = output.get_partial_shape();
+            VPU_THROW_UNLESS(outputPartialShape.is_static(),
+                             "DynamicToStaticShape pass: after all the transformations there is "
+                             "still dynamism in the network. First met node with dynamic output: "
+                             "%s (type: %s)", node->get_friendly_name(), node->get_type_name());
+            return false;
+        }
+    }
+    return true;
+}
+
+}  // namespace pass
+}  // namespace ngraph
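
A minimal end-to-end sketch of applying the pass (the convertNonZero helper and the parameter shape are illustrative): NonZero is rewritten into the StaticShapeNonZero plus DynamicShapeResolver pair, after which validateStaticShapes expects every output shape in the function to be static.

    #include "vpu/ngraph/transformations/dynamic_to_static_shape.hpp"

    #include <ngraph/opsets/opset3.hpp>
    #include <memory>

    std::shared_ptr<ngraph::Function> convertNonZero() {
        auto param = std::make_shared<ngraph::opset3::Parameter>(ngraph::element::f16, ngraph::Shape{2, 3});
        auto nonZero = std::make_shared<ngraph::opset3::NonZero>(param);
        auto function = std::make_shared<ngraph::Function>(ngraph::NodeVector{nonZero},
                                                           ngraph::ParameterVector{param});

        // Rewrites NonZero and then checks that no dynamic shapes remain.
        ngraph::pass::DynamicToStaticShape().run_on_function(function);
        return function;
    }
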
diff --git a/inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp b/inference-engine/src/vpu/common/src/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp
new file mode 100644 (file)
index 0000000..604dd90
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+
+#include <vpu/ngraph/operations/static_shape_nonzero.hpp>
+#include <vpu/ngraph/operations/dynamic_shape_resolver.hpp>
+
+#include <ngraph/opsets/opset3.hpp>
+
+#include <memory>
+
+namespace ngraph {
+namespace pass {
+
+DynamicToStaticShapeNonZero::DynamicToStaticShapeNonZero() {
+    // We don't set strict_mode when using the pattern Matcher,
+    // so we can set any type and shape for the input.
+    auto inputWithAnyTypeAndShape = std::make_shared<pattern::op::Label>(
+            element::dynamic, PartialShape{});
+    auto nonZeroPattern = std::make_shared<ngraph::op::NonZero>(inputWithAnyTypeAndShape);
+
+    ngraph::graph_rewrite_callback callback = [](pattern::Matcher& matcher) {
+        const auto nonZero = std::dynamic_pointer_cast<ngraph::opset3::NonZero>(matcher.get_match_root());
+        if (!nonZero) {
+            return false;
+        }
+
+        auto staticShapeNonZero = std::make_shared<ngraph::op::StaticShapeNonZero>(
+                nonZero->input(0).get_source_output());
+        staticShapeNonZero->set_friendly_name(nonZero->get_friendly_name() + "/static_shape");
+
+        auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(
+                staticShapeNonZero->output(0), staticShapeNonZero->output(1));
+        dynamicShapeResolver->set_friendly_name(nonZero->get_friendly_name() + "/resolve_shape");
+
+        ngraph::replace_node(matcher.get_match_root(), dynamicShapeResolver);
+        return true;
+    };
+
+    const auto matcher = std::make_shared<ngraph::pattern::Matcher>(
+            nonZeroPattern, "DynamicToStaticShapeNonZero");
+    this->add_matcher(matcher, callback, PassProperty::CHANGE_DYNAMIC_STATE);
+}
+
+}  // namespace pass
+}  // namespace ngraph
index 3bc19c0..dc46aea 100644 (file)
@@ -40,16 +40,13 @@ InferenceEngine::Layout deviceLayout(InferenceEngine::Layout const& layout,
     return layout;
 }
 
-ie::Blob::Ptr getBlobFP16(const ie::Blob::Ptr& in) {
-    IE_PROFILING_AUTO_SCOPE(getBlobFP16);
+ie::Blob::Ptr convertBlobFP32toFP16(const ie::Blob::CPtr& in) {
+    IE_PROFILING_AUTO_SCOPE(convertBlobFP32toFP16);
 
     auto inDesc = in->getTensorDesc();
 
     auto precision = inDesc.getPrecision();
 
-    if (precision == ie::Precision::FP16)
-        return in;
-
     if (precision != ie::Precision::FP32) {
         VPU_THROW_EXCEPTION << "Unsupported precision " << precision.name();
     }
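
With the FP16 early-return removed and the signature narrowed, the precision check moves to the call site. A caller-side sketch follows (the asFP16 helper name is illustrative, and the ie_helpers.hpp include path is an assumption since it is not shown in this hunk):

    #include <vpu/utils/ie_helpers.hpp>

    namespace vpu {

    ie::Blob::Ptr asFP16(const ie::Blob::Ptr& blob) {
        if (blob->getTensorDesc().getPrecision() == ie::Precision::FP16) {
            return blob;  // already FP16: the converter now throws on anything but FP32
        }
        return convertBlobFP32toFP16(blob);  // FP32 -> FP16 copy
    }

    }  // namespace vpu
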
index 96df041..35379a1 100644 (file)
@@ -1,6 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
+// Define if runtime supports it. MX runtime is compatible
 #define USE_MANUAL_DMA 1
 
 #if defined (USE_MANUAL_DMA)
index ab595bb..16b4dc9 100644 (file)
@@ -1,6 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// Define if runtime supports it. MX runtime is compatible, KMB is in WIP state
+// Define if runtime supports it. MX runtime is compatible
 #define USE_MANUAL_DMA 1
 
 // Set to 1 if only the output is zeroed before kernel execution
index 87654a8..371504a 100644 (file)
@@ -4,17 +4,18 @@
 
 #pragma once
 
-#include <memory>
+#include <vpu/graph_transformer.hpp>
+
+#include <vpu/model/model.hpp>
+#include <vpu/backend/blob_format.hpp>
+#include <ie_layers.h>
+
 #include <string>
+#include <memory>
 #include <set>
 #include <vector>
 #include <utility>
 
-#include <ie_layers.h>
-
-#include <vpu/graph_transformer.hpp>
-#include <vpu/model/model.hpp>
-
 namespace vpu {
 
 namespace ie = InferenceEngine;
@@ -38,6 +39,23 @@ private:
             std::pair<char*, size_t>& blobHeader,
             int& numActiveStages);
 
+    int serializeIOInfoSection(
+            const Model& model,
+            DataUsage dataUsage,
+            BlobSerializer& blobSerializer);
+
+    void serializeConstData(
+            const Model& model,
+            const mv_blob_header& blobHdr,
+            std::vector<char>& blob);
+
+    void serializeConstShapes(
+            const Model& model,
+            const mv_blob_header& blobHdr,
+            std::vector<char>& blob);
+
+    ElfN_Ehdr createElfHeader();
+
     void getMetaData(
             const Model& model,
             const std::vector<ie::CNNLayerPtr>& allLayers,
index 90ba3af..cc3847a 100644 (file)
@@ -146,6 +146,10 @@ public:
     void parseOneHot(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
     void parseExpPriorGridGenerator(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
     void parseExpGenerateProposals(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+    void parseScatterUpdate(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+    void parseExpTopKROIs(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+    void parseNonZero(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
+    void parseROIAlign(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const;
 
     //
     // Special layers
index 6b1919f..d3449cf 100644 (file)
@@ -184,7 +184,7 @@ std::set<std::string> getSupportedLayers(
 //
 
 const uint32_t BLOB_MAGIC_NUMBER  = 9709;
-const uint32_t BLOB_VERSION_MAJOR = 5;
+const uint32_t BLOB_VERSION_MAJOR = 6;
 const uint32_t BLOB_VERSION_MINOR = 0;
 
 }  // namespace vpu
index d6006f8..66c44d1 100644 (file)
@@ -77,6 +77,7 @@ public:
      * Allocates memory for single data node
      */
     bool allocateData(const Data& data);
+    ShapeLocation allocateConstShape(Data& data);
     void freeData(const Data& data, DeallocationMode mode = DeallocationMode::JustFree);
 
     void selfCheck();
index 8954e63..63e3e6a 100644 (file)
@@ -175,6 +175,39 @@ struct HwConvTileInfo final {
     double cost = std::numeric_limits<double>::max();
 };
 
+//
+// Structs for split
+//
+
+struct Slice {
+    int start;
+    size_t size;
+
+    Slice(int start, size_t size) :
+            start(start),
+            size(size) {}
+};
+
+struct DataSlice {
+    Data data;
+    Slice slice;
+
+    DataSlice(Data data, Slice slice) :
+            data(std::move(data)),
+            slice(slice) {}
+};
+
+using DataSlices = std::vector<DataSlice>;
+
+struct ConvTileSlice {
+    HwConvTileInfo tile;
+    Slice slice;
+
+    ConvTileSlice(HwConvTileInfo tile, Slice slice) :
+            tile(tile),
+            slice(slice) {}
+};
+
 void printTo(std::ostream& os, const HwConvTileInfo& convTiles);
 void printTo(DotLabel& lbl, const HwConvTileInfo& convTiles);
 
index 045588c..529587b 100644 (file)
@@ -131,28 +131,6 @@ HwPaddingInfo getHwPaddingInfo(
 void printTo(std::ostream& os, const HwPaddingInfo& hwPad);
 void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad);
 
-
-//
-// HwWeightsContent
-//
-
-class HwWeightsContent final : public CalculatedDataContent {
-public:
-    HwWeightsContent(
-            const DataContent::Ptr& origContent,
-            const DataDesc& origWeightsDesc,
-            int numInputChannels,
-            int channelStartIndex = 0);
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
-
-private:
-    DataDesc _origWeightsDesc;
-    int _numInputChannels = 0;
-    int _channelStartIndex = 0;
-};
-
 //
 // calculateHwBufferSize
 //
index 023c81e..3965a8b 100644 (file)
@@ -83,18 +83,6 @@ void deconv_to_conv(const T* src, T* dst, const DataDesc& desc) {
 }
 
 //
-// DefaultSwWeightsContent
-//
-
-class DefaultSwWeightsContent final : public CalculatedDataContent {
-public:
-    explicit DefaultSwWeightsContent(const DataContent::Ptr& origContent);
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override;
-};
-
-//
 // getOneOfSingleNextStage
 //
 
index ece7c1a..e4d656f 100644 (file)
@@ -4,21 +4,22 @@
 
 #pragma once
 
-#include <memory>
-#include <string>
-#include <functional>
-#include <vector>
-
-#include <ie_data.h>
-#include <ie_blob.h>
-
 #include <vpu/model/base.hpp>
 #include <vpu/model/edges.hpp>
 #include <vpu/model/data_desc.hpp>
+#include <vpu/model/data_contents/data_content.hpp>
 #include <vpu/backend/blob_serializer.hpp>
 #include <vpu/utils/enums.hpp>
 #include <vpu/utils/func_ref.hpp>
 
+#include <ie_data.h>
+#include <ie_blob.h>
+
+#include <memory>
+#include <string>
+#include <functional>
+#include <vector>
+
 namespace vpu {
 
 namespace ie = InferenceEngine;
@@ -46,15 +47,15 @@ VPU_DECLARE_ENUM(DataUsage,
 )
 
 //
-// DataLocation
+// Location
 //
 
 //
-// Describes where Data object is located.
+// Describes where particular data or shape is located.
 //
 
 // Must be synchronized with MvTensor
-VPU_DECLARE_ENUM(DataLocation,
+VPU_DECLARE_ENUM(Location,
     None = 0,
     Input = 1,
     Output = 2,
@@ -67,75 +68,25 @@ VPU_DECLARE_ENUM(MemoryType,
     DDR,
     CMX)
 
-//
-// DataContent
-//
-
-//
-// Content of the Const Data object.
-//
-
-class DataContent {
-public:
-    using Ptr = std::shared_ptr<DataContent>;
-
-    virtual ~DataContent();
-
-    // TYPED pointer
-    template <typename T>
-    const T* get() const {
-        return static_cast<const T*>(getRaw());
-    }
-
-    const DataDesc& desc() const {
-        return _desc;
-    }
-
-private:
-    // RAW pointer
-    virtual const void* getRaw() const = 0;
-
-private:
-    DataDesc _desc;
-
-    friend ModelObj;
+struct DataLocation final {
+    Location location;
+    int offset;
 };
 
-//
-// Data content that is calculated on the fly, using lazy calculation:
-//
-//   * It performs calculation on the first call and stores it in internal buffer.
-//   * Next access will return the pointer to calculated buffer.
-//
-class CalculatedDataContent : public DataContent {
-public:
-    CalculatedDataContent() = default;
-    explicit CalculatedDataContent(const SmallVector<DataContent::Ptr, 2>& baseContents) : _baseContents(baseContents) {}
-
-private:
-    const void* getRaw() const override;
-
-    virtual size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const;
-    virtual void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const = 0;
-
-private:
-    mutable SmallVector<DataContent::Ptr, 2> _baseContents;
-    mutable std::vector<uint8_t> _temp;
+static constexpr DataLocation defaultDataLocation = {
+    Location::None, 0
 };
 
-DataContent::Ptr ieBlobContent(
-        const ie::Blob::Ptr& blob,
-        int repeat = 1);
-
-DataContent::Ptr replicateContent(float val, int count);
-DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count);
-
-DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale);
+struct ShapeLocation final {
+    Location dimsLocation;
+    int dimsOffset;
+    Location stridesLocation;
+    int stridesOffset;
+};
 
-// The function scales the major dimension of 4D origContent
-DataContent::Ptr scaledChannelContent(
-        const DataContent::Ptr& origContent,
-        const DataContent::Ptr& scaleContent);
+static constexpr ShapeLocation defaultShapeLocation = {
+        Location::None, 0, Location::None, 0
+};
 
 //
 // DataNode
@@ -189,8 +140,8 @@ class DataNode final :
     //
 
     VPU_MODEL_ATTRIBUTE(MemoryType, memReqs, MemoryType::DDR)
-    VPU_MODEL_ATTRIBUTE(DataLocation, location, DataLocation::None)
-    VPU_MODEL_ATTRIBUTE(int, memoryOffset, 0)
+    VPU_MODEL_ATTRIBUTE(DataLocation, dataLocation, defaultDataLocation)
+    VPU_MODEL_ATTRIBUTE(ShapeLocation, shapeLocation, defaultShapeLocation)
 
     //
     // Edges wrappers
@@ -282,19 +233,18 @@ public:
 
     void setMemReqs(MemoryType mem);
 
-    void setIOInfo(DataLocation location, int ioBufferOffset);
+    void setIOInfo(Location location, int ioBufferOffset);
 
-    void setAllocationInfo(DataLocation location, int memoryOffset);
+    void setDataAllocationInfo(const DataLocation& dataLocation);
+
+    void setShapeAllocationInfo(const ShapeLocation& shapeLocation);
 
     //
     // Backend utilities
     //
 
     // Serialize as-is for new MvTensor kernels that can work with ND data.
-    // If `newOrder` is not empty, it will be used instead of original and missing dimensions will be set to 1.
-    void serializeBuffer(
-            BlobSerializer& serializer,
-            DimsOrder newOrder = DimsOrder());
+    void serializeBuffer(BlobSerializer& serializer);
 
     void serializeIOInfo(BlobSerializer& serializer) const;
 
@@ -304,11 +254,6 @@ private:
             const DataDesc& storedDesc,
             const DimValues& storedStrides) const;
 
-    void serializeBufferImpl(
-            BlobSerializer& serializer,
-            const DataDesc& storedDesc,
-            const DimValues& storedStrides) const;
-
 private:
     inline DataNode() :
         _consumerEdges(&StageInputEdge::_posInData),
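
The single location/memoryOffset pair is split into a DataLocation for the payload and a ShapeLocation that can place dims and strides independently. A minimal sketch of how an allocator might fill them in (the placeConstInBlob helper and the offsets are illustrative, not part of the commit):

    #include <vpu/model/data.hpp>

    namespace vpu {

    // Place a constant's payload and its shape metadata into the Blob section.
    void placeConstInBlob(const Data& data, int dataOffset, int dimsOffset, int stridesOffset) {
        data->setDataAllocationInfo(DataLocation{Location::Blob, dataOffset});
        data->setShapeAllocationInfo(ShapeLocation{Location::Blob, dimsOffset,
                                                   Location::Blob, stridesOffset});
    }

    }  // namespace vpu
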
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/batch_norm_contents.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/batch_norm_contents.hpp
new file mode 100644 (file)
index 0000000..f5a474e
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// BatchNormalizationWeightsContent
+//
+
+class BatchNormalizationWeightsContent final : public CalculatedDataContent {
+public:
+    BatchNormalizationWeightsContent(const DataContent::Ptr& origContent, float epsilon);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    float _epsilon;
+};
+
+//
+// BatchNormalizationBiasesContent
+//
+
+class BatchNormalizationBiasesContent final : public CalculatedDataContent {
+public:
+    BatchNormalizationBiasesContent(const DataContent::Ptr& origContent, const DataContent::Ptr& weightsContent);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataContent::CPtr _weightsContent;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/calculated_data_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/calculated_data_content.hpp
new file mode 100644 (file)
index 0000000..4e6d89d
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+#include <vpu/utils/small_vector.hpp>
+#include <vpu/model/data_desc.hpp>
+
+namespace vpu {
+
+//
+// Data content that is calculated on the fly, using lazy calculation:
+//
+//   * It performs calculation on the first call and stores it in internal buffer.
+//   * Next access will return the pointer to calculated buffer.
+//
+
+class CalculatedDataContent : public DataContent {
+public:
+    CalculatedDataContent() = default;
+
+private:
+    const void* getRaw() const override;
+
+    virtual void fillTempBuf(void *tempBuf) const = 0;
+
+private:
+    mutable std::vector<uint8_t> _temp;
+};
+
+} // namespace vpu
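
With the refactored interface, a derived content only provides its size and a fill routine; the base class fills and caches the temporary buffer on the first access, as the comment above describes. A minimal sketch of a subclass (the ZeroContent class is illustrative, it is not part of the commit):

    #include <vpu/model/data_contents/calculated_data_content.hpp>

    #include <algorithm>
    #include <cstdint>

    namespace vpu {

    class ZeroContent final : public CalculatedDataContent {
    public:
        explicit ZeroContent(size_t byteSize) : _byteSize(byteSize) {}

        size_t byteSize() const override { return _byteSize; }

    protected:
        // Called once, on the first get<T>() access; the result is kept in the base class buffer.
        void fillTempBuf(void* tempBuf) const override {
            std::fill_n(static_cast<uint8_t*>(tempBuf), _byteSize, uint8_t{0});
        }

    private:
        size_t _byteSize = 0;
    };

    }  // namespace vpu
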
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/conv_weights_contents.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/conv_weights_contents.hpp
new file mode 100644 (file)
index 0000000..162f5ae
--- /dev/null
@@ -0,0 +1,65 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// ConvIm2ColWeightsContent
+//
+
+class ConvIm2ColWeightsContent final : public CalculatedDataContent {
+public:
+    explicit ConvIm2ColWeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _desc;
+};
+
+//
+// Conv3x3WeightsContent
+//
+
+class Conv3x3WeightsContent final : public CalculatedDataContent {
+public:
+    explicit Conv3x3WeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _desc;
+};
+
+//
+// ConvCHWWeightsContent
+//
+
+class ConvCHWWeightsContent final : public CalculatedDataContent {
+public:
+    explicit ConvCHWWeightsContent(const DataContent::Ptr& origContent, DataDesc desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _desc;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/data_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/data_content.hpp
new file mode 100644 (file)
index 0000000..ae2a74f
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/utils/numeric.hpp>
+
+#include <details/ie_exception.hpp>
+
+#include <memory>
+#include <cstdint>
+
+namespace vpu {
+
+class DataContent {
+public:
+    using Ptr = std::shared_ptr<DataContent>;
+    using CPtr = std::shared_ptr<const DataContent>;
+
+    virtual ~DataContent();
+
+    template<typename T>
+    const T* get() const {
+        return static_cast<const T*>(getRaw());
+    }
+
+    virtual size_t byteSize() const = 0;
+
+private:
+    virtual const void* getRaw() const = 0;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/deconvolution_contents.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/deconvolution_contents.hpp
new file mode 100644 (file)
index 0000000..321cb2a
--- /dev/null
@@ -0,0 +1,100 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// DeconvolutionToConvolutionContent
+//
+
+class DeconvolutionToConvolutionContent final : public CalculatedDataContent {
+public:
+    DeconvolutionToConvolutionContent(const DataContent::Ptr& origContent, const DataDesc& desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _desc;
+};
+
+//
+// DepthDeconvolutionCHWWeightsContent
+//
+
+class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
+public:
+    DepthDeconvolutionCHWWeightsContent(
+            const DataContent::Ptr& origContent,
+            int KX, int KY, int channels);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    int _KX;
+    int _KY;
+    int _channels;
+};
+
+//
+// DepthDeconvolutionHWCWeightsContent
+//
+
+class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
+public:
+    DepthDeconvolutionHWCWeightsContent(
+            const DataContent::Ptr& origContent,
+            int KX, int KY, int channels);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    int _KX;
+    int _KY;
+    int _channels;
+};
+
+//
+// DeconvolutionWeightsContent
+//
+
+class DeconvolutionWeightsContent final : public CalculatedDataContent {
+public:
+    DeconvolutionWeightsContent(
+            const DataContent::Ptr& origContent,
+            DataDesc desc,
+            int KX, int KY,
+            int IC, int OC);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataDesc _desc;
+    DataContent::CPtr _origContent;
+    mutable std::vector<fp16_t> _intermBuf;
+    int _KX;
+    int _KY;
+    int _IC;
+    int _OC;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/default_sw_weights_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/default_sw_weights_content.hpp
new file mode 100644 (file)
index 0000000..5aee9b9
--- /dev/null
@@ -0,0 +1,25 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class DefaultSwWeightsContent final : public CalculatedDataContent {
+public:
+    DefaultSwWeightsContent(const DataContent::Ptr& origContent, const DataDesc& desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void* tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _desc;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_const_data_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_const_data_content.hpp
new file mode 100644 (file)
index 0000000..3452c61
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+#include <vpu/middleend/hw/tiling.hpp>
+
+namespace vpu {
+
+class HwConstData final : public CalculatedDataContent {
+public:
+    HwConstData(
+            const DataContent::Ptr& origContent,
+            const DataDesc& origDesc,
+            const DataDesc& resDesc,
+            const std::map<Dim, Slice> dimSlices);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *outBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _origDesc;
+    DataDesc _resDesc;
+    std::map<Dim, Slice> _dimSlices;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_weights_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/hw_weights_content.hpp
new file mode 100644 (file)
index 0000000..1f16c19
--- /dev/null
@@ -0,0 +1,33 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class HwWeightsContent final : public CalculatedDataContent {
+public:
+    HwWeightsContent(
+            const DataContent::Ptr& origContent,
+            const DataDesc& origWeightsDesc,
+            const DataDesc& resDesc,
+            int numInputChannels,
+            int channelStartIndex = 0);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    DataDesc _origDesc;
+    DataDesc _resDesc;
+    int _numInputChannels = 0;
+    int _channelStartIndex = 0;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/ie_blob_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/ie_blob_content.hpp
new file mode 100644 (file)
index 0000000..ea49e17
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+#include <vpu/model/data.hpp>
+
+namespace vpu {
+
+class IeBlobContent final : public DataContent {
+public:
+    IeBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType);
+
+    size_t byteSize() const override;
+
+protected:
+    const void* getRaw() const override;
+
+private:
+    DataType _resultDataType;
+    mutable ie::Blob::CPtr _blob;
+    mutable ie::Blob::CPtr _blobFp16;
+};
+
+DataContent::Ptr ieBlobContent(const ie::Blob::CPtr& blob, DataType resultPrecision = DataType::FP16);
+
+} // namespace vpu
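
A short usage sketch of the factory (the wrapWeights helper is illustrative, and the assumption is that the blob holds FP32 or FP16 weights): the content keeps a reference to the Inference Engine blob, and the mutable members above suggest the conversion to the requested DataType, FP16 by default, happens lazily on first access.

    #include <vpu/model/data_contents/ie_blob_content.hpp>

    vpu::DataContent::Ptr wrapWeights(const InferenceEngine::Blob::CPtr& blob) {
        // Result data type defaults to DataType::FP16.
        return vpu::ieBlobContent(blob);
    }
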
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/kernel_binary_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/kernel_binary_content.hpp
new file mode 100644 (file)
index 0000000..b696f93
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+class KernelBinaryContent final : public DataContent {
+public:
+    explicit KernelBinaryContent(const std::string& blob);
+
+    size_t byteSize() const override;
+
+protected:
+    const void* getRaw() const override;
+
+private:
+    std::string _blob;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mean_contents.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mean_contents.hpp
new file mode 100644 (file)
index 0000000..ded2dd1
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+#include <ie_preprocess.hpp>
+
+namespace vpu {
+
+//
+// MeanImageContent
+//
+
+class MeanImageContent final : public CalculatedDataContent {
+public:
+    MeanImageContent(const ie::PreProcessInfo& info, const DataDesc& desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataDesc _desc;
+    ie::PreProcessInfo _info;
+};
+
+//
+// MeanValueContent
+//
+
+class MeanValueContent final : public CalculatedDataContent {
+public:
+    explicit MeanValueContent(const ie::PreProcessInfo& info);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    ie::PreProcessInfo _info;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/merge_fc_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/merge_fc_content.hpp
new file mode 100644 (file)
index 0000000..7018a47
--- /dev/null
@@ -0,0 +1,28 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class MergeFullyConnectedContentsByChannels final : public CalculatedDataContent {
+public:
+    MergeFullyConnectedContentsByChannels(const std::vector<DataContent::CPtr> contents,
+                                          const std::vector<DataDesc> inDescs,
+                                          const DataDesc& resDesc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *temp) const override;
+
+private:
+    std::vector<DataContent::CPtr> _contents;
+    std::vector<DataDesc> _inDescs;
+    DataDesc _resDesc;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mtcnn_blob_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/mtcnn_blob_content.hpp
new file mode 100644 (file)
index 0000000..625aa58
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+class MTCNNBlobContent final : public DataContent {
+public:
+    explicit MTCNNBlobContent(std::vector<char> blob);
+
+    size_t byteSize() const override;
+
+protected:
+    const void* getRaw() const override;
+
+private:
+    std::vector<char> _blob;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/prelu_blob_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/prelu_blob_content.hpp
new file mode 100644 (file)
index 0000000..4cc985e
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/data_content.hpp>
+#include <vpu/model/data_desc.hpp>
+
+#include <ie_blob.h>
+
+namespace vpu {
+
+class PReLUBlobContent final : public DataContent {
+public:
+    PReLUBlobContent(const InferenceEngine::Blob::CPtr& blob, const DataDesc& desc, int repeat);
+
+    size_t byteSize() const override;
+
+protected:
+    const void* getRaw() const override;
+
+private:
+    InferenceEngine::Blob::CPtr _blob;
+    int _repeat = 0;
+    DataDesc _desc;
+
+    mutable InferenceEngine::Blob::CPtr _blobFp16;
+    mutable std::vector<fp16_t> _tempFp16;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/priorbox_contents.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/priorbox_contents.hpp
new file mode 100644 (file)
index 0000000..110109c
--- /dev/null
@@ -0,0 +1,59 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+//
+// PriorBoxContent
+//
+
+class PriorBoxContent final : public CalculatedDataContent {
+public:
+    PriorBoxContent(
+            const DataDesc& inDesc0,
+            const DataDesc& inDesc1,
+            const DataDesc& outDesc,
+            const ie::CNNLayerPtr &layer);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataDesc _inDesc0;
+    DataDesc _inDesc1;
+    DataDesc _outDesc;
+    ie::CNNLayerPtr _layer;
+};
+
+//
+// PriorBoxClusteredContent
+//
+
+class PriorBoxClusteredContent final : public CalculatedDataContent {
+public:
+    PriorBoxClusteredContent(
+            const DataDesc& inDesc0,
+            const DataDesc& inDesc1,
+            const DataDesc& outDesc,
+            const ie::CNNLayerPtr& layer);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataDesc _inDesc0;
+    DataDesc _inDesc1;
+    DataDesc _outDesc;
+    ie::CNNLayerPtr _layer;
+};
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/replicated_data_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/replicated_data_content.hpp
new file mode 100644 (file)
index 0000000..a6ed7de
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class ReplicatedContent final : public CalculatedDataContent {
+public:
+    ReplicatedContent(float val, int count, const DataDesc& desc);
+
+    ReplicatedContent(DataContent::Ptr origContent, int count, const DataDesc& desc);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent = nullptr;
+    DataDesc _desc;
+    float _factor = 1.0f;
+    int _count = 0;
+};
+
+DataContent::Ptr replicateContent(float val, int count, const DataDesc& desc);
+DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count, const DataDesc& desc);
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/scaled_content.hpp b/inference-engine/src/vpu/graph_transformer/include/vpu/model/data_contents/scaled_content.hpp
new file mode 100644 (file)
index 0000000..56cd314
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+class ScaledContent final : public CalculatedDataContent {
+public:
+    ScaledContent(const DataContent::Ptr& origContent, float scale);
+
+    size_t byteSize() const override;
+
+protected:
+    void fillTempBuf(void *tempBuf) const override;
+
+private:
+    DataContent::CPtr _origContent;
+    float _factor = 1.0f;
+};
+
+DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale);
+
+} // namespace vpu
index 083c380..e0f17fd 100644 (file)
@@ -567,6 +567,8 @@ public:
 
     int totalDimSize() const;
 
+    int dimsByteSize() const { return numDims() * static_cast<int>(sizeof(int32_t)); }
+
     //
     // DimsOrder
     //
index 84786c1..206cd4e 100644 (file)
@@ -144,6 +144,7 @@ VPU_DECLARE_ENUM(StageType,
     Exp = 101,
     Floor = 102,
     TopK = 104,
+    ScatterUpdate = 103,
     ReduceMin = 105,
     ExpDetectionOutput = 106,  // ExperimentalDetectronDetectionOutput
     NonMaxSuppression = 107,
@@ -160,7 +161,10 @@ VPU_DECLARE_ENUM(StageType,
     LoopStart = 119,
     LoopEnd = 120,
     ExpPriorGridGenerator = 121,
+    NonZero = 122,
+    ROIAlign = 123,
     ExpGenerateProposals = 124,
+    ExpTopKROIs = 125,
 )
 
 //
index e87d841..b1a2156 100644 (file)
@@ -265,6 +265,16 @@ public:
             const DataVector& inputs,
             const Data& output);
 
+    Stage addScatterUpdateStage(
+            const Model& model,
+            const std::string& name,
+            const ie::CNNLayerPtr& layer,
+            const Data& input,
+            const Data& output,
+            const Data& indices,
+            const Data& updates,
+            const Data& axis);
+
     Stage addLoopStartStage(
         const Model& model,
         const std::string& name,
index 8e9912d..ec3311e 100644 (file)
@@ -129,15 +129,15 @@ void BackEnd::dumpModelToDot(
             } else if (data->usage() == DataUsage::Temp) {
                 dataColor = "cyan";
             } else if (data->usage() == DataUsage::Intermediate) {
-                if (data->location() == DataLocation::BSS) {
+                if (data->dataLocation().location == Location::BSS) {
                     dataColor = "cyan";
-                } else if (data->location() == DataLocation::CMX) {
+                } else if (data->dataLocation().location == Location::CMX) {
                     dataColor = "magenta";
-                } else if (data->location() == DataLocation::Blob) {
+                } else if (data->dataLocation().location == Location::Blob) {
                     dataColor = "aquamarine";
-                } else if (data->location() == DataLocation::Input) {
+                } else if (data->dataLocation().location == Location::Input) {
                     dataColor = "green";
-                } else if (data->location() == DataLocation::Output) {
+                } else if (data->dataLocation().location == Location::Output) {
                     dataColor = "deepskyblue";
                 }
             }
@@ -179,8 +179,8 @@ void BackEnd::dumpModelToDot(
                     }
                 }
                 lbl.appendPair("memReqs", data->memReqs());
-                lbl.appendPair("location", data->location());
-                lbl.appendPair("memoryOffset", data->memoryOffset());
+                lbl.appendPair("location", data->dataLocation().location);
+                lbl.appendPair("memoryOffset", data->dataLocation().offset);
                 if (!data->attrs().empty()) {
                     lbl.appendPair("extraAttrs", data->attrs());
                 }
index a6536ed..d98e6e0 100644 (file)
@@ -4,9 +4,21 @@
 
 #include <vpu/backend/backend.hpp>
 
+#include <vpu/parsed_config.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/dot_io.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/utils/numeric.hpp>
+
+#include <precision_utils.h>
+#include <details/caseless.hpp>
+#include <graph_tools.hpp>
+#include <description_buffer.hpp>
+#include <xml_parse_utils.h>
+
 #include <climits>
 #include <cstring>
-
 #include <string>
 #include <memory>
 #include <list>
 #include <iomanip>
 #include <atomic>
 
-#include <precision_utils.h>
-#include <details/caseless.hpp>
-#include <graph_tools.hpp>
-#include <description_buffer.hpp>
-#include <xml_parse_utils.h>
-
-#include <vpu/parsed_config.hpp>
-#include <vpu/compile_env.hpp>
-#include <vpu/backend/blob_format.hpp>
-#include <vpu/utils/auto_scope.hpp>
-#include <vpu/utils/dot_io.hpp>
-#include <vpu/utils/file_system.hpp>
-#include <vpu/utils/numeric.hpp>
-
 namespace vpu {
 
-void BackEnd::serialize(
+struct ModelStagesStat final {
+    bool hasHwStage;
+    bool hasShaveStage;
+    bool hasDmaStage;
+};
+
+int BackEnd::serializeIOInfoSection(
         const Model& model,
-        std::vector<char>& blob,
-        std::pair<char*, size_t>& blobHeader,
-        int& numActiveStages) {
-    VPU_PROFILE(serialize);
+        DataUsage dataUsage,
+        BlobSerializer& blobSerializer) {
+    VPU_INTERNAL_CHECK(dataUsage == DataUsage::Input || dataUsage == DataUsage::Output,
+        "serializeIOInfoSection was called with {} usage while only {} and {} usages are supported",
+        dataUsage, DataUsage::Input, DataUsage::Output);
 
-    const auto& env = CompileEnv::get();
+    int datasNumber = 0;
+
+    for (const auto& data : model->datas()) {
+        if (data->usage() != dataUsage) {
+            continue;
+        }
 
-    auto batchSize = model->batchSize();
-    auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+        if (dataUsage == DataUsage::Input) {
+            VPU_INTERNAL_CHECK(data->producerEdge() == nullptr,
+                "serializeIOInfoSection failed on input data {}. Input must have no producer but actually it has: {} with type {}",
+                data->name(), data->producerEdge()->producer()->name(), data->producerEdge()->producer()->type());
+            VPU_INTERNAL_CHECK(data->numConsumers() != 0,
+                "serializeIOInfoSection failed on input data {}. Input must have at least one consumer but it doesn't ",
+                data->usage());
+        }
 
-    //
-    // Remove special stages from the stages list
-    //
+        if (dataUsage == DataUsage::Output) {
+            VPU_INTERNAL_CHECK(data->producerEdge() != nullptr,
+                "serializeIOInfoSection failed on output data {}. Output must have any producer but it doesn't",
+                data->usage());
+        }
 
-    bool hasHwStage = false;
-    bool hasShaveStage = false;
-    bool hasDmaStage = false;
+        VPU_INTERNAL_CHECK(data->parentDataEdge() == nullptr,
+            "serializeIOInfoSection failed on {} with usage {}. IO data must have no parent data but it does",
+            data->name(), data->usage());
 
-    StageVector execStages;
-    execStages.reserve(model->numStages());
+        VPU_INTERNAL_CHECK(!data->attrs().has("ioIdx"),
+            "serializeIOInfoSection failed: IO data {} with usage {} doesn't have ioIdx attribute",
+            data->name(), data->usage());
 
-    for (const auto& stage : model->getStages()) {
-        if (stage->category() == StageCategory::Special) {
-            continue;
-        }
+        data->attrs().set("ioIdx", datasNumber);
 
-        if (stage->category() == StageCategory::HW) {
-            hasHwStage = true;
-        } else if (stage->category() == StageCategory::SHAVE) {
-            hasShaveStage = true;
-        } else if (stage->category() == StageCategory::DMA) {
-            hasDmaStage = true;
-        }
+        data->serializeIOInfo(blobSerializer);
 
-        execStages.emplace_back(stage);
+        ++datasNumber;
     }
 
-    numActiveStages = execStages.size();
+    return datasNumber;
+}
 
-    //
-    // I/O info sections
-    //
+ElfN_Ehdr BackEnd::createElfHeader() {
+    ElfN_Ehdr elfHdr = {};
+    elfHdr.e_ident[0] = 0x7f;
+    elfHdr.e_ident[1] = 'e';
+    elfHdr.e_ident[2] = 'l';
+    elfHdr.e_ident[3] = 'f';
+    for (int i = 4; i < 16; i++) {
+        elfHdr.e_ident[i] = 0;
+    }
+    elfHdr.e_type = 1;
+    elfHdr.e_machine = 2;
+    elfHdr.e_version = 2;
+    elfHdr.e_entry = 0;
+    elfHdr.e_phoff = 0;
+    elfHdr.e_shoff = 0;
+    elfHdr.e_ehsize = 8 * sizeof(elfHdr);
 
-    int numInputs = 0;
-    BlobSerializer inputInfoSerializer;
+    return elfHdr;
+}
+
+void BackEnd::serializeConstData(const Model& model, const mv_blob_header& blobHdr, std::vector<char>& blob) {
     for (const auto& data : model->datas()) {
-        if (data->usage() != DataUsage::Input) {
+        if (data->usage() != DataUsage::Const) {
             continue;
         }
 
         IE_ASSERT(data->producerEdge() == nullptr);
         IE_ASSERT(data->parentDataEdge() == nullptr);
         IE_ASSERT(data->numConsumers() != 0);
+        IE_ASSERT(data->dataLocation().location == Location::Blob);
 
-        IE_ASSERT(!data->attrs().has("ioIdx"));
-        data->attrs().set("ioIdx", numInputs);
-
-        data->serializeIOInfo(inputInfoSerializer);
+        const auto content = data->content();
+        IE_ASSERT(content != nullptr);
 
-        ++numInputs;
+        std::copy_n(content->get<uint8_t>(), content->byteSize(), blob.data() + blobHdr.const_data_section_offset + data->dataLocation().offset);
     }
+}
 
-    int numOutputs = 0;
-    BlobSerializer outputInfoSerializer;
+void BackEnd::serializeConstShapes(const Model& model, const mv_blob_header& blobHdr, std::vector<char>& blob) {
     for (const auto& data : model->datas()) {
-        if (data->usage() != DataUsage::Output) {
-            continue;
-        }
+        const auto serializeToBlob = [&data, &blob, &blobHdr](const BlobSerializer& serializer, int offset) {
+            std::copy_n(serializer.data(), data->desc().numDims() * sizeof(uint32_t), blob.data() + blobHdr.const_data_section_offset + offset);
+        };
 
-        IE_ASSERT(data->producerEdge() != nullptr);
-        IE_ASSERT(data->parentDataEdge() == nullptr);
+        const auto dimsOrder = data->desc().dimsOrder();
+        const auto storedPerm = dimsOrder.toPermutation();
 
-        IE_ASSERT(!data->attrs().has("ioIdx"));
-        data->attrs().set("ioIdx", numOutputs);
+        const auto shapeLocation = data->shapeLocation();
 
-        data->serializeIOInfo(outputInfoSerializer);
+        if (shapeLocation.dimsLocation == Location::Blob) {
+            BlobSerializer dimsSerializer;
+            const auto dims = data->desc().dims();
 
-        ++numOutputs;
+            for (const auto& d : storedPerm) {
+                dimsSerializer.append(checked_cast<uint32_t>(dims[d]));
+            }
+            serializeToBlob(dimsSerializer, shapeLocation.dimsOffset);
+        }
+
+        if (shapeLocation.stridesLocation == Location::Blob) {
+            BlobSerializer stridesSerializer;
+            const auto strides = data->strides();
+
+            for (const auto& d : storedPerm) {
+                stridesSerializer.append(checked_cast<uint32_t>(strides[d]));
+            }
+            serializeToBlob(stridesSerializer, shapeLocation.stridesOffset);
+        }
     }
+}
 
-    //
-    // Stages section
-    //
+void BackEnd::serialize(
+        const Model& model,
+        std::vector<char>& blob,
+        std::pair<char*, size_t>& blobHeader,
+        int& numActiveStages) {
+    VPU_PROFILE(serialize);
 
+    const auto& env = CompileEnv::get();
 
+    BlobSerializer inputInfoSerializer;
+    BlobSerializer outputInfoSerializer;
     BlobSerializer stagesSerializer;
+
+    const auto getExecStages = [&model]() {
+        StageVector execStages;
+        execStages.reserve(model->numStages());
+
+        for (const auto& stage : model->getStages()) {
+            if (stage->category() == StageCategory::Special) {
+                continue;
+            }
+
+            execStages.emplace_back(stage);
+        }
+
+        return execStages;
+    };
+
+    const auto getModelStagesStat = [&model]() {
+        ModelStagesStat modelStagesStat{false, false, false};
+
+        for (const auto& stage : model->getStages()) {
+            if (stage->category() == StageCategory::Special) {
+                continue;
+            }
+
+            if (stage->category() == StageCategory::HW) {
+                modelStagesStat.hasHwStage = true;
+            } else if (stage->category() == StageCategory::SHAVE) {
+                modelStagesStat.hasShaveStage = true;
+            } else if (stage->category() == StageCategory::DMA) {
+                modelStagesStat.hasDmaStage = true;
+            }
+        }
+
+        return modelStagesStat;
+    };
+
+    const auto createBlobHeader = [&env, &model, &inputInfoSerializer, &outputInfoSerializer, &stagesSerializer]
+            (int numInputs, int numOutputs, const StageVector& execStages, const ModelStagesStat& modelStagesStat) {
+        const auto batchSize = model->batchSize();
+        const auto usedMemory = model->attrs().get<UsedMemory>("usedMemory");
+
+        const auto hdrSize = alignVal<int>(sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), 64);
+        const auto inputInfoSecSize = alignVal(inputInfoSerializer.size(), 64);
+        const auto outputInfoSecSize = alignVal(outputInfoSerializer.size(), 64);
+        const auto stagesSecSize = alignVal(stagesSerializer.size(), 64);
+        const auto constDataSecSize = alignVal(usedMemory.blob, 64);
+
+        mv_blob_header blobHdr = {};
+        blobHdr.magic_number = BLOB_MAGIC_NUMBER;
+        blobHdr.file_size = checked_cast<uint32_t>(hdrSize + inputInfoSecSize + outputInfoSecSize + stagesSecSize + constDataSecSize);
+        blobHdr.blob_ver_major = BLOB_VERSION_MAJOR;
+        blobHdr.blob_ver_minor = BLOB_VERSION_MINOR;
+        blobHdr.inputs_count = checked_cast<uint32_t>(numInputs);
+        blobHdr.outputs_count = checked_cast<uint32_t>(numOutputs);
+        blobHdr.stages_count = checked_cast<uint32_t>(execStages.size());
+        blobHdr.inputs_size = checked_cast<uint32_t>(usedMemory.input);
+        blobHdr.outputs_size = checked_cast<uint32_t>(usedMemory.output);
+        blobHdr.batch_size = checked_cast<uint32_t>(batchSize);
+        blobHdr.bss_mem_size = checked_cast<uint32_t>(usedMemory.BSS);
+        blobHdr.number_of_cmx_slices = checked_cast<uint32_t>(env.resources.numCMXSlices);
+        blobHdr.number_of_shaves = checked_cast<uint32_t>(env.resources.numSHAVEs);
+        blobHdr.has_hw_stage = checked_cast<uint32_t>(modelStagesStat.hasHwStage);
+        blobHdr.has_shave_stage = checked_cast<uint32_t>(modelStagesStat.hasShaveStage);
+        blobHdr.has_dma_stage = checked_cast<uint32_t>(modelStagesStat.hasDmaStage);
+        blobHdr.input_info_section_offset = checked_cast<uint32_t>(hdrSize);
+        blobHdr.output_info_section_offset = checked_cast<uint32_t>(blobHdr.input_info_section_offset + inputInfoSecSize);
+        blobHdr.stage_section_offset = checked_cast<uint32_t>(blobHdr.output_info_section_offset + outputInfoSecSize);
+        blobHdr.const_data_section_offset = checked_cast<uint32_t>(blobHdr.stage_section_offset + stagesSecSize);
+
+        return blobHdr;
+    };
+
+    const int numInputs = serializeIOInfoSection(model, DataUsage::Input, inputInfoSerializer);
+    const int numOutputs = serializeIOInfoSection(model, DataUsage::Output, outputInfoSerializer);
+
+    const auto& execStages = getExecStages();
+    numActiveStages = checked_cast<int>(execStages.size());
+
     for (const auto& stage : execStages) {
         stage->serialize(stagesSerializer);
     }
 
-    //
-    // Elf header
-    //
+    const auto modelStagesStat = getModelStagesStat();
 
-    ElfN_Ehdr elfHdr = {};
-    elfHdr.e_ident[0] = 0x7f;
-    elfHdr.e_ident[1] = 'e';
-    elfHdr.e_ident[2] = 'l';
-    elfHdr.e_ident[3] = 'f';
-    for (int i = 4; i < 16; i++) {
-        elfHdr.e_ident[i] = 0;
-    }
-    elfHdr.e_type = 1;
-    elfHdr.e_machine = 2;
-    elfHdr.e_version = 2;
-    elfHdr.e_entry = 0;
-    elfHdr.e_phoff = 0;
-    elfHdr.e_shoff = 0;
-    elfHdr.e_ehsize = 8 * sizeof(elfHdr);
-
-    //
-    // Blob header
-    //
-
-    auto hdrSize = alignVal<int>(sizeof(ElfN_Ehdr) + sizeof(mv_blob_header), 64);
-    auto inputInfoSecSize = alignVal(inputInfoSerializer.size(), 64);
-    auto outputInfoSecSize = alignVal(outputInfoSerializer.size(), 64);
-    auto stagesSecSize = alignVal(stagesSerializer.size(), 64);
-    auto constDataSecSize = alignVal(usedMemory.blob, 64);
-
-    mv_blob_header blobHdr = {};
-    blobHdr.magic_number = BLOB_MAGIC_NUMBER;
-    blobHdr.file_size = checked_cast<uint32_t>(hdrSize + inputInfoSecSize + outputInfoSecSize + stagesSecSize + constDataSecSize);
-    blobHdr.blob_ver_major = BLOB_VERSION_MAJOR;
-    blobHdr.blob_ver_minor = BLOB_VERSION_MINOR;
-    blobHdr.inputs_count = checked_cast<uint32_t>(numInputs);
-    blobHdr.outputs_count = checked_cast<uint32_t>(numOutputs);
-    blobHdr.stages_count = checked_cast<uint32_t>(execStages.size());
-    blobHdr.inputs_size = checked_cast<uint32_t>(usedMemory.input);
-    blobHdr.outputs_size = checked_cast<uint32_t>(usedMemory.output);
-    blobHdr.batch_size = checked_cast<uint32_t>(batchSize);
-    blobHdr.bss_mem_size = checked_cast<uint32_t>(usedMemory.BSS);
-    blobHdr.number_of_cmx_slices = checked_cast<uint32_t>(env.resources.numCMXSlices);
-    blobHdr.number_of_shaves = checked_cast<uint32_t>(env.resources.numSHAVEs);
-    blobHdr.has_hw_stage = checked_cast<uint32_t>(hasHwStage);
-    blobHdr.has_shave_stage = checked_cast<uint32_t>(hasShaveStage);
-    blobHdr.has_dma_stage = checked_cast<uint32_t>(hasDmaStage);
-    blobHdr.input_info_section_offset = checked_cast<uint32_t>(hdrSize);
-    blobHdr.output_info_section_offset = checked_cast<uint32_t>(blobHdr.input_info_section_offset + inputInfoSecSize);
-    blobHdr.stage_section_offset = checked_cast<uint32_t>(blobHdr.output_info_section_offset + outputInfoSecSize);
-    blobHdr.const_data_section_offset = checked_cast<uint32_t>(blobHdr.stage_section_offset + stagesSecSize);
-
-    //
-    // Generate fathom blob
-    //
+    const auto elfHdr = createElfHeader();
+    const auto blobHdr = createBlobHeader(numInputs, numOutputs, execStages, modelStagesStat);
 
     blob.clear();
     blob.resize(blobHdr.file_size, 0);
@@ -198,25 +271,8 @@ void BackEnd::serialize(
     std::copy_n(outputInfoSerializer.data(), outputInfoSerializer.size(), blob.data() + blobHdr.output_info_section_offset);
     std::copy_n(stagesSerializer.data(), stagesSerializer.size(), blob.data() + blobHdr.stage_section_offset);
 
-    for (const auto& data : model->datas()) {
-        if (data->usage() != DataUsage::Const) {
-            continue;
-        }
-
-        IE_ASSERT(data->producerEdge() == nullptr);
-        IE_ASSERT(data->parentDataEdge() == nullptr);
-        IE_ASSERT(data->numConsumers() != 0);
-        IE_ASSERT(data->location() == DataLocation::Blob);
-
-        auto content = data->content();
-        IE_ASSERT(content != nullptr);
-
-        std::copy_n(content->get<uint8_t>(), data->totalByteSize(), blob.data() + blobHdr.const_data_section_offset + data->memoryOffset());
-    }
-
-    //
-    // Blob header spec begin containing elf header and blobHeader
-    //
+    serializeConstData(model, blobHdr, blob);
+    serializeConstShapes(model, blobHdr, blob);
 
     blobHeader.first = blob.data();
     blobHeader.second = sizeof(ElfN_Ehdr) + sizeof(mv_blob_header);
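
The blob assembled here is laid out as ELF header + blob header followed by 64-byte-aligned sections, and createBlobHeader chains the section offsets: each section starts where the previous aligned section ends. A standalone sketch of that arithmetic, with placeholder sizes assumed in place of the real serializer sizes and usedMemory values:

#include <cstdint>
#include <cstdio>

// Round val up to the nearest multiple of align (64 in the serializer).
static uint32_t alignUp(uint32_t val, uint32_t align) {
    return (val + align - 1) / align * align;
}

int main() {
    const uint32_t hdrSize        = alignUp(52 + 128, 64);  // ElfN_Ehdr + mv_blob_header (placeholder sizes)
    const uint32_t inputInfoSize  = alignUp(200, 64);
    const uint32_t outputInfoSize = alignUp(120, 64);
    const uint32_t stagesSize     = alignUp(4096, 64);

    const uint32_t inputInfoOffset  = hdrSize;
    const uint32_t outputInfoOffset = inputInfoOffset + inputInfoSize;
    const uint32_t stageOffset      = outputInfoOffset + outputInfoSize;
    const uint32_t constDataOffset  = stageOffset + stagesSize;

    std::printf("const data section starts at %u\n", constDataOffset);
    return 0;
}
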
index ab2546c..80b34c8 100644 (file)
@@ -65,7 +65,7 @@ void BlobReader::parse(const std::vector<char>& blob) {
         // Truncate zeros
         inputName = inputName.c_str();
 
-        auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+        auto dataType = readFromBlob<DataType>(blob, inputInfoSecOffset);
         auto orderCode = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
 
         auto numDims = readFromBlob<uint32_t>(blob, inputInfoSecOffset);
@@ -74,14 +74,21 @@ void BlobReader::parse(const std::vector<char>& blob) {
         auto perm = dimsOrder.toPermutation();
         IE_ASSERT(perm.size() == numDims);
 
+        auto dimsLocation = readFromBlob<Location>(blob, inputInfoSecOffset);
+        VPU_THROW_UNLESS(dimsLocation == Location::Blob,
+            "BlobReader error while parsing {} input data: only Blob location for input shape is supported, but {} was given",
+            inputName, dimsLocation);
+        auto dimsOffset = _blobHeader.const_data_section_offset + readFromBlob<uint32_t>(blob, inputInfoSecOffset);
+
+        // Skip strides' location and offset
+        inputInfoSecOffset += 2 * sizeof(uint32_t);
+
         DimValues vpuDims;
+
         for (int i = 0; i < perm.size(); ++i) {
-            vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, inputInfoSecOffset));
+            vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, dimsOffset));
         }
 
-        // Skip strides
-        inputInfoSecOffset += perm.size() * sizeof(uint32_t);
-
         ie::TensorDesc ieDesc = DataDesc(dataType, dimsOrder, vpuDims).toTensorDesc();
         ie::Data inputData(inputName, ieDesc);
 
@@ -108,7 +115,7 @@ void BlobReader::parse(const std::vector<char>& blob) {
         // Truncate zeros
         outputName = outputName.c_str();
 
-        auto dataType = static_cast<DataType>(readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+        auto dataType = readFromBlob<DataType>(blob, outputInfoSecOffset);
         auto orderCode = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
 
         auto numDims = readFromBlob<uint32_t>(blob, outputInfoSecOffset);
@@ -117,14 +124,21 @@ void BlobReader::parse(const std::vector<char>& blob) {
         auto perm = dimsOrder.toPermutation();
         IE_ASSERT(perm.size() == numDims);
 
+        auto dimsLocation = readFromBlob<Location>(blob, outputInfoSecOffset);
+        VPU_THROW_UNLESS(dimsLocation == Location::Blob,
+            "BlobReader error while parsing {} output data: only Blob location for output shape is supported, but {} was given",
+            outputName, dimsLocation);
+        auto dimsOffset = _blobHeader.const_data_section_offset + readFromBlob<uint32_t>(blob, outputInfoSecOffset);
+
+        // Skip strides' location and offset
+        outputInfoSecOffset += 2 * sizeof(uint32_t);
+
         DimValues vpuDims;
+
         for (int i = 0; i < perm.size(); ++i) {
-            vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, outputInfoSecOffset));
+            vpuDims.set(perm[i], readFromBlob<uint32_t>(blob, dimsOffset));
         }
 
-        // Skip strides
-        outputInfoSecOffset += perm.size() * sizeof(uint32_t);
-
         ie::TensorDesc ieDesc = DataDesc(dataType, dimsOrder, vpuDims).toTensorDesc();
         ie::Data outputData(outputName, ieDesc);
 
index 4261bf4..c00fc96 100644 (file)
@@ -13,7 +13,6 @@
 #include <details/caseless.hpp>
 #include <details/ie_cnn_network_iterator.hpp>
 #include <cpp/ie_cnn_network.h>
-#include <cnn_network_ngraph_impl.hpp>
 #include <graph_tools.hpp>
 
 #include <ngraph/function.hpp>
@@ -39,7 +38,7 @@ void FrontEnd::detectNetworkBatch(
     auto checkForDeprecatedCnn = [&network, &env]() {
         return !network.getFunction()
                && !env.config.forceDeprecatedCnnConversion
-               && dynamic_cast<const ie::details::CNNNetworkNGraphImpl*>(&network);
+               && !dynamic_cast<const ie::details::CNNNetworkImpl*>(&network);
     };
     VPU_THROW_UNLESS(!checkForDeprecatedCnn(), "Unexpected CNNNetwork format: it was converted to deprecated format prior plugin's call");
 
index 8d2c822..03ede02 100644 (file)
@@ -5,6 +5,7 @@
 #include "vpu/frontend/frontend.hpp"
 #include "vpu/utils/profiling.hpp"
 #include "vpu/compile_env.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
 
 #include "net_pass.h"
 
@@ -98,6 +99,10 @@ FrontEnd::FrontEnd(StageBuilder::Ptr stageBuilder)
         {"OneHot",                                             LAYER_PARSER(parseOneHot)},
         {"ExperimentalDetectronPriorGridGenerator",            LAYER_PARSER(parseExpPriorGridGenerator)},
         {"ExperimentalDetectronGenerateProposalsSingleImage",  LAYER_PARSER(parseExpGenerateProposals)},
+        {"ScatterUpdate",                                      LAYER_PARSER(parseScatterUpdate)},
+        {"ExperimentalDetectronTopKROIs",                      LAYER_PARSER(parseExpTopKROIs)},
+        {"StaticShapeNonZero",                                 LAYER_PARSER(parseNonZero)},
+        {"ROIAlign",                                           LAYER_PARSER(parseROIAlign)},
     }} {}
 
 ModelPtr FrontEnd::buildInitialModel(ie::ICNNNetwork& network) {
index f419b73..16c14ac 100644 (file)
@@ -4,15 +4,16 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <memory>
 #include <algorithm>
 #include <set>
 #include <map>
 #include <string>
 
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-
 namespace vpu {
 
 void FrontEnd::parseInputAndOutputData(const Model& model) {
@@ -113,7 +114,7 @@ void FrontEnd::parseInputAndOutputData(const Model& model) {
         const auto vpuData = model->addConstData(
             ieData->getName(),
             descriptor,
-            ieBlobContent(ieBlob));
+            ieBlobContent(ieBlob, descriptor.type()));
 
         // User might ask to return the output from Const layer.
         if (const auto vpuOutData = getVpuData(ieData)) {
index 29fc10b..72185e3 100644 (file)
 
 #include <vpu/frontend/frontend.hpp>
 
-#include <vector>
-#include <memory>
-#include <string>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/mean_contents.hpp>
 
 #include <details/caseless.hpp>
 #include <cpp/ie_cnn_network.h>
 #include <precision_utils.h>
 #include <ie_parallel.hpp>
 
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-#include <vpu/compile_env.hpp>
+#include <vector>
+#include <memory>
+#include <string>
 
 namespace vpu {
 
-namespace {
-
-class MeanImageContent final : public CalculatedDataContent {
-public:
-    explicit MeanImageContent(const ie::PreProcessInfo& info) : _info(info) {
-    }
-
-protected:
-    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
-        size_t countElem = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H) * desc().dim(Dim::C));
-        if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
-            countElem *= 2;
-        }
-
-        return countElem * sizeof(fp16_t);
-    }
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
-        VPU_PROFILE(MeanImageContent);
-
-        const size_t numOfChannel = _info.getNumberOfChannels();
-
-        const size_t imagePixels = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H));
-        const size_t countElem = checked_cast<size_t>(desc().dim(Dim::W) * desc().dim(Dim::H) * desc().dim(Dim::C));
-
-        const auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        auto dstPtr2 = dstPtr;
-        if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
-            dstPtr2 += countElem;
-        }
-
-        ie::parallel_for(numOfChannel, [=](size_t i) {
-            const auto meanDataBlob = _info[i]->meanData;
-
-            ie::PrecisionUtils::f32tof16Arrays(
-                dstPtr2 + i * imagePixels,
-                meanDataBlob->buffer().as<const float*>(),
-                imagePixels,
-                -1.0f);
-        });
-
-        if (desc().dimsOrder() == DimsOrder::NHWC || desc().dimsOrder() == DimsOrder::HWC) {
-            kchw_to_hwck(dstPtr2, dstPtr, desc());
-        }
-    }
-
-private:
-    ie::PreProcessInfo _info;
-};
-
-class MeanValueContent final : public CalculatedDataContent {
-public:
-    explicit MeanValueContent(const ie::PreProcessInfo& info) : _info(info) {
-    }
-
-protected:
-    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
-        return _info.getNumberOfChannels() * sizeof(fp16_t);
-    }
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
-        VPU_PROFILE(MeanValueContent);
-
-        IE_ASSERT(checked_cast<size_t>(desc().totalDimSize()) == _info.getNumberOfChannels());
-
-        const auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        ie::parallel_for(_info.getNumberOfChannels(), [dstPtr, this](size_t i) {
-            dstPtr[i] = ie::PrecisionUtils::f32tof16(-_info[i]->meanValue);
-        });
-    }
-
-private:
-    ie::PreProcessInfo _info;
-};
-
-}  // namespace
-
 void FrontEnd::addPreProcessStages(const Model& model) {
     VPU_PROFILE(addPreProcessStages);
 
@@ -131,7 +53,7 @@ void FrontEnd::addPreProcessStages(const Model& model) {
             const auto meanImage = model->addConstData(
                 input->name() + "@mean-image",
                 input->desc(),
-                std::make_shared<MeanImageContent>(preProcess));
+                std::make_shared<MeanImageContent>(preProcess, input->desc()));
 
             const auto newInput = model->duplicateData(
                 input,
index 5551c70..15b95fa 100644 (file)
@@ -7,7 +7,6 @@
 #include "graph_transformer.h"
 
 #include "cnn_network_impl.hpp"
-#include "cnn_network_ngraph_impl.hpp"
 
 namespace vpu {
 
@@ -19,16 +18,7 @@ void FrontEnd::removeConstLayers(ie::ICNNNetwork& network) {
     env.log->trace("Remove const layers");
     VPU_LOGGER_SECTION(env.log);
 
-    ie::ICNNNetwork* cnnNetwork = &network;
-    if (auto nGraphImpl = dynamic_cast<ie::details::CNNNetworkNGraphImpl*>(&network)) {
-        // NGraph implementation cannot be casted to CNNNetworkImpl directly
-        cnnNetwork = nGraphImpl->getCNNNetwork().get();
-    }
-
-    // valid for CNNNetworkImpl only, while there's no API in ICNNNetwork to change network
-    if (auto cnnNetworkImpl = dynamic_cast<ie::details::CNNNetworkImpl*>(cnnNetwork)) {
-        ie::ConstTransformer(cnnNetworkImpl).fullTrim();
-    }
+    ie::ConstTransformer(&network).fullTrim();
 }
 
 }  // namespace vpu
index d30fb3b..a20de0c 100644 (file)
@@ -65,7 +65,7 @@ void updateChildDataAllocation(const Data& data, int offsetLimitation) {
         auto parent = edge->parent();
         auto child = edge->child();
 
-        auto memoryOffset = parent->memoryOffset();
+        auto memoryOffset = parent->dataLocation().offset;
 
         if (edge->mode() == SharedDataMode::ROI) {
             auto parentStrides = parent->strides();
@@ -86,7 +86,7 @@ void updateChildDataAllocation(const Data& data, int offsetLimitation) {
             IE_ASSERT(false) << "Unsupported enum value";
         }
 
-        child->setAllocationInfo(parent->location(), memoryOffset);
+        child->setDataAllocationInfo({parent->dataLocation().location, memoryOffset});
 
         updateChildDataAllocation(child, offsetLimitation);
     }
@@ -127,7 +127,7 @@ bool Allocator::allocateData(const Data& data) {
 
             auto finalByteSize = data->totalByteSize() * _modelBatchSize;
 
-            data->setIOInfo(DataLocation::Input, alignVal(_inputMemOffset, DATA_ALIGNMENT));
+            data->setIOInfo(Location::Input, alignVal(_inputMemOffset, DATA_ALIGNMENT));
             _inputMemOffset = alignVal(_inputMemOffset, DATA_ALIGNMENT) + finalByteSize;
 
             updateChildDataAllocation(data, DDR_MAX_SIZE);
@@ -153,7 +153,7 @@ bool Allocator::allocateData(const Data& data) {
                 finalByteSize = data->totalByteSize() * _modelBatchSize;
             }
 
-            data->setIOInfo(DataLocation::Output, alignVal(_outputMemOffset, DATA_ALIGNMENT));
+            data->setIOInfo(Location::Output, alignVal(_outputMemOffset, DATA_ALIGNMENT));
             _outputMemOffset = alignVal(_outputMemOffset, DATA_ALIGNMENT) + finalByteSize;
 
             updateChildDataAllocation(data, DDR_MAX_SIZE);
@@ -176,7 +176,7 @@ bool Allocator::allocateData(const Data& data) {
 
             auto finalByteSize = calcAllocationSize(data);
 
-            data->setAllocationInfo(DataLocation::Blob, _blobMemOffset);
+            data->setDataAllocationInfo({Location::Blob, _blobMemOffset});
             _blobMemOffset += finalByteSize;
 
             updateChildDataAllocation(data, DDR_MAX_SIZE);
@@ -257,9 +257,9 @@ bool Allocator::allocateData(const Data& data) {
     // Update data allocation info
     //
 
-    data->setAllocationInfo(chunk->memType == MemoryType::CMX ? DataLocation::CMX : DataLocation::BSS, chunk->pointer);
+    data->setDataAllocationInfo({chunk->memType == MemoryType::CMX ? Location::CMX : Location::BSS, chunk->pointer});
 
-    auto offsetLimitation = (data->location() == DataLocation::CMX) ? _maxCmxSize : DDR_MAX_SIZE;
+    auto offsetLimitation = (data->dataLocation().location == Location::CMX) ? _maxCmxSize : DDR_MAX_SIZE;
     updateChildDataAllocation(data, offsetLimitation);
 
     _memChunksPerData.emplace(data, chunk);
@@ -268,6 +268,23 @@ bool Allocator::allocateData(const Data& data) {
     return chunk->memType == memoryType;
 }
 
+ShapeLocation Allocator::allocateConstShape(Data& data) {
+    ShapeLocation shapeLocation;
+
+    shapeLocation.dimsLocation = Location::Blob;
+    shapeLocation.stridesLocation = Location::Blob;
+
+    const auto dimsByteSize = data->desc().dimsByteSize();
+
+    shapeLocation.dimsOffset = _blobMemOffset;
+    _blobMemOffset += dimsByteSize;
+
+    shapeLocation.stridesOffset = _blobMemOffset;
+    _blobMemOffset += dimsByteSize;
+
+    return shapeLocation;
+}
+
 void Allocator::freeData(const Data& data, DeallocationMode mode) {
     //
     // Release the chunk
@@ -313,7 +330,7 @@ void Allocator::freeData(const Data& data, DeallocationMode mode) {
 
             _memChunksPerData[data] = ddrChunk;
 
-            data->setAllocationInfo(DataLocation::BSS, ddrChunk->pointer);
+            data->setDataAllocationInfo({Location::BSS, ddrChunk->pointer});
             updateChildDataAllocation(data, DDR_MAX_SIZE);
 
             break;
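
allocateConstShape above reserves two equal-sized slots in the blob section, one for dims and one for strides, each dimsByteSize bytes long, advancing the running blob offset as it goes. A small sketch of that bookkeeping, with simplified placeholder types standing in for the allocator's ShapeLocation and member offsets:

#include <cstdint>

enum class Location { Blob };

// Simplified mirror of the ShapeLocation aggregate used by the allocator.
struct ShapeLocation {
    Location dimsLocation = Location::Blob;
    int dimsOffset = 0;
    Location stridesLocation = Location::Blob;
    int stridesOffset = 0;
};

// Reserve back-to-back slots for dims and strides inside the blob section.
ShapeLocation allocateConstShape(int& blobMemOffset, int numDims) {
    const int dimsByteSize = numDims * static_cast<int>(sizeof(uint32_t));

    ShapeLocation shape;
    shape.dimsOffset = blobMemOffset;
    blobMemOffset += dimsByteSize;

    shape.stridesOffset = blobMemOffset;
    blobMemOffset += dimsByteSize;

    return shape;
}
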
index d6c5e9d..40831e6 100644 (file)
@@ -4,6 +4,16 @@
 
 #include <vpu/middleend/hw/conv_tiling/hw_stage_tiler.hpp>
 
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/stages/mx_stage.hpp>
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/utils/attributes_map.hpp>
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+#include <vpu/model/data_contents/scaled_content.hpp>
+
 #include <precision_utils.h>
 #include <memory>
 #include <list>
 #include <unordered_map>
 #include <set>
 
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/stages/mx_stage.hpp>
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-#include <vpu/utils/attributes_map.hpp>
-
 namespace vpu {
 
 namespace {
@@ -149,7 +153,7 @@ Data HWConvStageTiler::createScales(const HwConvTilingPtr& tiling, const HWConvS
                 hwScales = _model->addConstData(
                     _original->name() + "@scales",
                     DataDesc({maxExtendedOutputDimC}),
-                    replicateContent(stageOptions.reluScale, maxExtendedOutputDimC));
+                    replicateContent(stageOptions.reluScale, maxExtendedOutputDimC, DataDesc{maxExtendedOutputDimC}));
             } else {
                 hwScales = _model->addFakeData();
             }
@@ -363,6 +367,7 @@ Data HWConvStageTiler::createConstTileWeights(const HwConvChannelTilePtr& channe
         const auto content = std::make_shared<HwWeightsContent>(
             io.origWeights->content(),
             io.origWeights->desc(),
+            descriptor,
             channelTile->numInputChannels,
             channelTile->channelStartIndex);
 
index aeb0663..2824103 100644 (file)
@@ -79,95 +79,6 @@ void printTo(DotLabel& lbl, const HwPaddingInfo& hwPad) {
     }
 }
 
-//
-// HwWeightsContent
-//
-
-HwWeightsContent::HwWeightsContent(const DataContent::Ptr& origContent,
-        const DataDesc& origWeightsDesc,
-        int numInputChannels,
-        int channelStartIndex) :
-        CalculatedDataContent({origContent}),
-        _origWeightsDesc(origWeightsDesc),
-        _numInputChannels(numInputChannels),
-        _channelStartIndex(channelStartIndex) {
-}
-
-void HwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
-    VPU_PROFILE(HwWeightsContent);
-
-    IE_ASSERT(desc().type() == DataType::FP16);
-    IE_ASSERT(baseContents.size() == 1);
-
-    auto KX = _origWeightsDesc.dim(Dim::W);
-    auto KY = _origWeightsDesc.dim(Dim::H);
-    auto IC = _origWeightsDesc.dim(Dim::C);
-    auto OC = _origWeightsDesc.dim(Dim::N);
-    auto origTotalSize = _origWeightsDesc.totalDimSize();
-
-    auto HW_OC_inner = desc().dim(Dim::W);
-    auto HW_OC_outer = desc().dim(Dim::N);
-    IE_ASSERT(HW_OC_outer * HW_OC_inner >= OC);
-
-    auto HW_K = desc().dim(Dim::H);
-    IE_ASSERT(HW_K == KX * KY);
-
-    IE_ASSERT(_channelStartIndex < IC);
-    auto HW_IC = desc().dim(Dim::C);
-    auto HW_IC_real = std::min(_numInputChannels, IC - _channelStartIndex);
-
-    auto srcData = baseContents[0]->get<fp16_t>();
-    IE_ASSERT(srcData != nullptr);
-
-    auto dstData = static_cast<fp16_t*>(tempBuf);
-
-    IE_ASSERT((_channelStartIndex + HW_IC_real) * HW_K + (OC - 1) * HW_K * IC - 1 < origTotalSize);
-    IE_ASSERT((OC - 1) % HW_OC_inner +
-              (HW_K - 1) * HW_OC_inner +
-              (HW_IC_real - 1) * HW_OC_inner * HW_K +
-              ((OC - 1) / 8) * HW_OC_inner * HW_K * HW_IC < desc().totalDimSize());
-
-    if (KX == 1 && KY == 1) {
-        ie::parallel_for(OC, [=](int oc) {
-            auto oc_inner = oc % HW_OC_inner;
-            auto oc_outer = oc / HW_OC_inner;
-            for (int ic = 0; ic < HW_IC_real; ++ic) {
-                auto srcInd =
-                        (_channelStartIndex + ic) +
-                        oc * IC;
-                auto dstInd =
-                        oc_inner +
-                        ic * HW_OC_inner * HW_K +
-                        oc_outer * HW_OC_inner * HW_K * HW_IC;
-
-                dstData[dstInd] = srcData[srcInd];
-            }
-        });
-    } else {
-        ie::parallel_for(OC, [=](int oc) {
-            auto oc_inner = oc % HW_OC_inner;
-            auto oc_outer = oc / HW_OC_inner;
-            for (int ic = 0; ic < HW_IC_real; ++ic) {
-                for (int ky = 0; ky < KY; ++ky) {
-                    for (int kx = 0; kx < KX; ++kx) {
-                        auto srcInd =
-                                (kx + ky * KX) +
-                                (_channelStartIndex + ic) * HW_K +
-                                oc * HW_K * IC;
-                        auto dstInd =
-                                oc_inner +
-                                (ky * KX + kx) * HW_OC_inner +
-                                ic * HW_OC_inner * HW_K +
-                                oc_outer * HW_OC_inner * HW_K * HW_IC;
-
-                        dstData[dstInd] = srcData[srcInd];
-                    }
-                }
-            }
-        });
-    }
-}
-
 int calculateHwBufferSize(const DimValues& dims, const DimsOrder& order) {
     const auto desc = DataDesc{DataType::FP16, order.empty() ? DimsOrder::fromNumDims(dims.size()) : order, dims};
     IE_ASSERT(desc.numDims() > 2 || desc.dimsOrder() == DimsOrder::NC);
index 0250216..8fb1480 100644 (file)
@@ -4,6 +4,7 @@
 
 #include "vpu/stages/iteration_rule.hpp"
 #include "vpu/middleend/pass_manager.hpp"
+#include "vpu/model/data_contents/replicated_data_content.hpp"
 
 #include <utility>
 #include <string>
index cfbcb9f..7381360 100644 (file)
@@ -358,9 +358,9 @@ void PassImpl::copyHwMisalignedInput(const Model& model) {
 
         auto inputEdge = stage->inputEdge(0);
         auto input = inputEdge->input();
-        IE_ASSERT(input->location() != DataLocation::None);
+        IE_ASSERT(input->dataLocation().location != Location::None);
 
-        if (input->memoryOffset() % 16 != 0) {
+        if (input->dataLocation().offset % 16 != 0) {
             env.log->trace("HW Stage [%s] input [%s]", stage->name(), input->name());
 
             auto newInput = model->duplicateData(
index 6a099a6..b664eb6 100644 (file)
@@ -184,6 +184,15 @@ AllocationResult runAllocator(const Model& model, bool onlyCheckCMX) {
         }
     }
 
+    //
+    // Allocate shape for all datas
+    //
+
+    for (auto data : model->datas()) {
+        const auto shapeLocation = allocator.allocateConstShape(data);
+        data->setShapeAllocationInfo(shapeLocation);
+    }
+
     return AllocationResult();
 }
 
index f65c559..d465ce2 100644 (file)
@@ -4,12 +4,13 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
-#include <vector>
-#include <memory>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
 
 #include <blob_factory.hpp>
 
-#include <vpu/middleend/sw/utility.hpp>
+#include <vector>
+#include <memory>
 
 namespace vpu {
 
index 45aa93e..afbf7ed 100644 (file)
@@ -41,7 +41,7 @@ void PassImpl::run(const Model& model) {
         });
 
         if (memoryType == MemoryType::CMX) {
-            IE_ASSERT(topParent->location() == DataLocation::CMX);
+            IE_ASSERT(topParent->dataLocation().location == Location::CMX);
         }
 
         //
index 90f009a..4066905 100644 (file)
@@ -4,9 +4,6 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
 #include <vpu/compile_env.hpp>
 #include <vpu/stages/stub_stage.hpp>
 #include <vpu/stages/mx_stage.hpp>
 #include <vpu/middleend/hw/utility.hpp>
 #include <vpu/middleend/hw/conv_tiling/hw_convolution_tiler.hpp>
 #include <vpu/middleend/hw/conv_tiling/hw_stage_tiler.hpp>
+#include <vpu/model/data_contents/hw_const_data_content.hpp>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
 
 #include <utility>
 #include <memory>
@@ -29,35 +30,6 @@ namespace vpu {
 
 namespace {
 
-struct Slice {
-    int start;
-    size_t size;
-
-    Slice(int start, size_t size) :
-        start(start),
-        size(size) {}
-};
-
-struct DataSlice {
-    Data data;
-    Slice slice;
-
-    DataSlice(Data data, Slice slice) :
-        data(std::move(data)),
-        slice(slice) {}
-};
-
-using DataSlices = std::vector<DataSlice>;
-
-struct ConvTileSlice {
-    HwConvTileInfo tile;
-    Slice slice;
-
-    ConvTileSlice(HwConvTileInfo tile, Slice slice) :
-        tile(tile),
-        slice(slice) {}
-};
-
 class PassImpl final : public Pass {
 public:
     explicit PassImpl(StageBuilder::Ptr stageBuilder) : _stageBuilder(std::move(stageBuilder)) {}
@@ -133,7 +105,7 @@ private:
             if (infoData1 != infoData2)
                 return infoData1 < infoData2;
 
-            const auto size = data1->content()->desc().totalDimSize();
+            const auto size = data1->content()->byteSize() / sizeof(fp16_t);
 
             const auto content1 = data1->content()->get<fp16_t>();
             const auto content2 = data2->content()->get<fp16_t>();
@@ -144,98 +116,6 @@ private:
     std::map<Data, DataSlices, LexicographicalCompareByData> _splitConstData;
 };
 
-class HwConstData final : public CalculatedDataContent {
-public:
-    HwConstData(
-        const DataContent::Ptr& origContent,
-        const DataDesc& origDesc,
-        const std::map<Dim, Slice> dimSlices) :
-            CalculatedDataContent({origContent}),
-            _origDesc(origDesc),
-            _dimSlices(dimSlices) {}
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* outBuf) const override {
-        VPU_PROFILE(HwConstData);
-
-        VPU_THROW_UNLESS(
-            desc().type() == DataType::FP16,
-            "Constant data has %v data type while only %v is supported",
-            desc().type(), DataType::FP16);
-
-        VPU_THROW_UNLESS(baseContents.size() == 1,
-            "Missing source buffer for constant data");
-
-        const auto srcData = baseContents[0]->get<fp16_t>();
-        auto dstData = static_cast<fp16_t*>(outBuf);
-
-        VPU_THROW_UNLESS(srcData != nullptr,
-            "Source buffer for constant data has null address");
-
-        auto getDimSlice = [this](const Dim dim) {
-            auto it = _dimSlices.find(dim);
-            if (it != _dimSlices.end()) {
-                return it->second;
-            }
-
-            const int startInd = 0;
-            const size_t size = _origDesc.dim(dim);
-
-            return Slice(startInd, size);
-        };
-
-        if (_origDesc.numDims() == 4) {
-            Slice slice = getDimSlice(Dim::N);
-
-            int startOC = slice.start;
-            size_t numOC = slice.size;
-
-            const auto IC = _origDesc.dim(Dim::C);
-            const auto K = _origDesc.dim(Dim::H);
-            const auto V = _origDesc.dim(Dim::W);
-
-            const auto kernelStride     = V;
-            const auto inChannelStride  = K * kernelStride;
-            const auto outerStride      = IC * inChannelStride;
-
-            ie::parallel_for(numOC, [=](int oc) {
-                const auto ocSlice = oc;
-                oc += startOC;
-
-                const auto ocInner = oc % V;
-                const auto ocOuter = oc / V;
-                const auto ocSliceInner = ocSlice % V;
-                const auto ocSliceOuter = ocSlice / V;
-
-                const auto ocSrc = ocInner + ocOuter * outerStride;
-                const auto ocDst = ocSliceInner + ocSliceOuter * outerStride;
-
-                for (int ic = 0; ic < IC; ++ic)
-                    for (int k = 0; k < K; ++k) {
-                        const auto srcInd = ocSrc +
-                                            k * kernelStride +
-                                            ic * inChannelStride;
-                        const auto dstInd = ocDst +
-                                            k * kernelStride +
-                                            ic * inChannelStride;
-
-                        dstData[dstInd] = srcData[srcInd];
-                    }
-            });
-        } else if (_origDesc.numDims() == 1) {
-            Slice slice = getDimSlice(Dim::C);
-
-            std::copy(srcData + slice.start, srcData + slice.start + slice.size, dstData);
-        } else {
-            THROW_IE_EXCEPTION << "Invalid number of dimensions " << _origDesc.numDims();
-        }
-    }
-
-private:
-    DataDesc _origDesc;
-    std::map<Dim, Slice> _dimSlices;
-};
-
 void PassImpl::run(const Model& model) {
     VPU_PROFILE(hwExtraSplit);
 
@@ -444,6 +324,7 @@ Data PassImpl::splitWeights(
     const auto content = std::make_shared<HwConstData>(
         weights->content(),
         weights->desc(),
+        weightsDesc,
         dimSlices);
 
     weightsDesc.setDim(Dim::N, alignVal(numChannels, 8) / vectorSize);
@@ -474,6 +355,7 @@ Data PassImpl::splitBiases(
     const auto biasesContent = std::make_shared<HwConstData>(
         biases->content(),
         biases->desc(),
+        newBiasesDesc,
         dimSlices);
     const auto newBiases = model->duplicateData(biases, postfix, newBiasesDesc, biasesContent);
 
@@ -502,6 +384,7 @@ Data PassImpl::splitScales(
     const auto scalesContent = std::make_shared<HwConstData>(
         scales->content(),
         scales->desc(),
+        newScalesDesc,
         dimSlices);
     const auto newScales = model->duplicateData(scales, postfix, newScalesDesc, scalesContent);
 
index fd1a7e0..c343b46 100644 (file)
@@ -4,8 +4,17 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
-#include <cmath>
+#include <vpu/compile_env.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/stages/mx_stage.hpp>
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <precision_utils.h>
 
+#include <cmath>
 #include <tuple>
 #include <vector>
 #include <limits>
 #include <set>
 #include <array>
 
-#include <precision_utils.h>
-
-#include <vpu/compile_env.hpp>
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/stages/mx_stage.hpp>
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
 namespace vpu {
 
 namespace {
@@ -190,6 +191,7 @@ Data createHWWeights(const Model& model, const Stage& original, int hwInputDimC,
 
         const auto& content = std::make_shared<HwWeightsContent>(
             origWeights->content(),
+            dataDescriptor,
             contentDescriptor,
             extendedHWInputDimC);
 
index 514a23e..571c690 100644 (file)
@@ -2,62 +2,20 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include <memory>
-#include <utility>
-#include <vector>
+#include <vpu/middleend/pass_manager.hpp>
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/merge_fc_content.hpp>
 
 #include <ie_parallel.hpp>
 
-#include <vpu/middleend/pass_manager.hpp>
-#include <vpu/stages/stub_stage.hpp>
+#include <memory>
+#include <utility>
+#include <vector>
 
 namespace vpu {
 
 namespace {
 
-class MergeFullyConnectedContentsByChannels final : public CalculatedDataContent {
-public:
-    explicit MergeFullyConnectedContentsByChannels(const SmallVector<DataContent::Ptr, 2>& contents) :
-        CalculatedDataContent(contents) {}
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& contents, void* temp) const override {
-        IE_ASSERT(!contents.empty());
-        // vpu::DataNode has content and vpu::DataDesc with dimensions' vector
-        // content has dimensions's vector as well
-        // they can be different so we extract channels number from contents
-        const auto dstC = std::accumulate(contents.begin(), contents.end(), 0, [](int reduction, const DataContent::Ptr& content) {
-            return reduction + content->desc().dims()[Dim::C];});
-
-        for (std::size_t i = 0, dstChannelsOffset = 0; i < contents.size(); ++i) {
-            const auto& content = contents[i];
-            const auto& srcDesc = content->desc();
-
-            const auto& srcDims = srcDesc.dims();
-            const auto& elemSize = srcDesc.elemSize();
-
-            const auto N = srcDims.get(Dim::N, 1);
-            const auto H = srcDims.get(Dim::H, 1);
-            const auto W = srcDims.get(Dim::W, 1) * elemSize;
-
-            const auto& srcC = srcDims[Dim::C];
-
-            const auto src = content->get<uint8_t>();
-                  auto dst = static_cast<uint8_t*>(temp);
-
-            InferenceEngine::parallel_for4d(N, srcC, H, W, [dstChannelsOffset, N, H, W, src, dst, srcC, dstC](int n, int c, int h, int w) {
-                const auto& srcc = c;
-                const auto& dstc = dstChannelsOffset + c;
-
-                const auto& srcOffset = n * H * W * srcC + srcc * H * W + h * W + w;
-                const auto& dstOffset = n * H * W * dstC + dstc * H * W + h * W + w;
-                dst[dstOffset] = src[srcOffset];
-            });
-
-            dstChannelsOffset += srcC;
-        }
-    }
-};
-
 DataDesc mergeDescriptors(const DataVector& dataObjects) {
     const auto& targetDim = Dim::C;
     auto mergedDescriptor = dataObjects.front()->desc();
@@ -72,13 +30,17 @@ Data mergeConstDataObjects(const Model& model, const DataVector& dataObjects) {
         return model->addFakeData();
     }
 
-    std::vector<DataContent::Ptr> contents;
+    std::vector<DataContent::CPtr> contents;
+    std::vector<DataDesc> descs;
     for (const auto& data : dataObjects) {
         contents.push_back(data->content());
+        descs.push_back(data->desc());
     }
 
-    auto content = std::make_shared<MergeFullyConnectedContentsByChannels>(contents);
-    return model->duplicateData(dataObjects.front(), "@merge-parallel-fc", mergeDescriptors(dataObjects), content);
+    auto mergedDesc = mergeDescriptors(dataObjects);
+
+    auto content = std::make_shared<MergeFullyConnectedContentsByChannels>(contents, descs, mergedDesc);
+    return model->duplicateData(dataObjects.front(), "@merge-parallel-fc", mergedDesc, content);
 }
 
 Data mergeOutputs(const Model& model, const DataVector& dataObjects) {
index 9bf073f..47b2ecf 100644 (file)
@@ -4,6 +4,11 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
 #include <tuple>
 #include <vector>
 #include <algorithm>
 #include <unordered_map>
 #include <memory>
 
-#include <vpu/stages/stub_stage.hpp>
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/compile_env.hpp>
-
 namespace vpu {
 
 namespace {
@@ -91,25 +92,6 @@ private:
     }
 };
 
-
-class DeconvolutionToConvolutionContent final : public CalculatedDataContent {
-public:
-    DeconvolutionToConvolutionContent(
-            const DataContent::Ptr& origContent) :
-            CalculatedDataContent({origContent}) {
-    }
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
-        VPU_PROFILE(DeconvolutionToConvolutionContent);
-
-        IE_ASSERT(baseContents.size() == 1);
-        IE_ASSERT(desc().type() == DataType::FP16);
-
-        deconv_to_conv(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-    }
-};
-
-
 class PassImpl final : public Pass {
 public:
     explicit PassImpl(const StageBuilder::Ptr& stageBuilder) : _stageBuilder(stageBuilder) {}
@@ -192,7 +174,7 @@ void PassImpl::run(const Model& model) {
 
         auto newOutput = model->duplicateData(output, "@upsampleData", newDesc);
         auto newWeights = model->duplicateData(weights, "@upsampleData", weights->desc(),
-                     std::make_shared<DeconvolutionToConvolutionContent>(weights->content()));
+                     std::make_shared<DeconvolutionToConvolutionContent>(weights->content(), weights->desc()));
 
         auto upsampleStage = model->addNewStage<UpsamplingStage>(
                 stage->origLayerName() + "@Upsample",
index 95cd18d..1f9875f 100644 (file)
@@ -4,6 +4,12 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/priorbox_contents.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
 #include <cmath>
 
 #include <algorithm>
 #include <vector>
 #include <queue>
 
-#include <ie_parallel.hpp>
-#include <precision_utils.h>
-
-#include <vpu/stages/stub_stage.hpp>
-
 namespace vpu {
 
 namespace {
 
-class PriorBoxContent final : public CalculatedDataContent {
-public:
-    PriorBoxContent(
-        const DataDesc& inDesc0,
-        const DataDesc& inDesc1,
-        const DataDesc& outDesc,
-        const ie::CNNLayerPtr &layer) :
-        _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
-        _layer(layer) {
-        IE_ASSERT(layer != nullptr);
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2> &, void *tempBuf) const override {
-        VPU_PROFILE(PriorBoxContent);
-
-        auto tempPtr = static_cast<fp16_t*>(tempBuf);
-
-        auto _min_sizes = _layer->GetParamAsFloats("min_size", {});
-        auto _max_sizes = _layer->GetParamAsFloats("max_size", {});
-        auto aspect_ratios = _layer->GetParamAsFloats("aspect_ratio");
-        auto _flip = static_cast<bool>(_layer->GetParamAsInt("flip"));
-        auto _clip = static_cast<bool>(_layer->GetParamAsInt("clip"));
-        auto _variance = _layer->GetParamAsFloats("variance");
-        auto _img_h = _layer->GetParamAsInt("img_h", 0);
-        auto _img_w = _layer->GetParamAsInt("img_w", 0);
-        auto _step = _layer->GetParamAsFloat("step", 0);
-        auto _offset = _layer->GetParamAsFloat("offset", 0);
-        auto _scale_all_sizes = static_cast<bool>(_layer->GetParamAsInt("scale_all_sizes", 1));
-
-        auto _fixed_sizes = _layer->GetParamAsFloats("fixed_size", {});
-        auto _fixed_ratios = _layer->GetParamAsFloats("fixed_ratio", {});
-        auto _densitys = _layer->GetParamAsFloats("density", {});
-
-        SmallVector<float> _aspect_ratios;
-        _aspect_ratios.reserve(aspect_ratios.size() + 1);
-
-        _aspect_ratios.push_back(1.0f);
-        for (const auto& aspect_ratio : aspect_ratios) {
-            bool exist = false;
-
-            for (const auto& _aspect_ratio : _aspect_ratios) {
-                if (fabsf(aspect_ratio - _aspect_ratio) < 1e-6) {
-                    exist = true;
-                    break;
-                }
-            }
-            if (!exist) {
-                _aspect_ratios.push_back(aspect_ratio);
-                if (_flip) {
-                    if (isFloatEqual(aspect_ratio, 0.f)) {
-                        THROW_IE_EXCEPTION << "[VPU] PriorBox has 0.0 aspect ratio param in flip mode, "
-                                           << " possible division by zero";
-                    }
-                    _aspect_ratios.push_back(1.0f / aspect_ratio);
-                }
-            }
-        }
-
-        int _num_priors;
-        if (_scale_all_sizes) {
-            _num_priors = static_cast<int>(_aspect_ratios.size() * _min_sizes.size());
-        } else {
-            _num_priors = static_cast<int>(_aspect_ratios.size() + _min_sizes.size() - 1);
-        }
-
-        if (!_fixed_sizes.empty()) {
-            _num_priors = static_cast<int>(_aspect_ratios.size() * _fixed_sizes.size());
-        }
-
-        if (!_densitys.empty()) {
-            for (const auto& _density : _densitys) {
-                if (!_fixed_ratios.empty()) {
-                    _num_priors += _fixed_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
-                } else {
-                    _num_priors += _aspect_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
-                }
-            }
-        }
-
-        _num_priors += _max_sizes.size();
-
-        auto W  = _inDesc0.dim(Dim::W);
-        auto H  = _inDesc0.dim(Dim::H);
-        auto IW = _img_w == 0 ? _inDesc1.dim(Dim::W) : _img_w;
-        auto IH = _img_h == 0 ? _inDesc1.dim(Dim::H) : _img_h;
-        auto IWI = 1.0f / static_cast<float>(IW);
-        auto IHI = 1.0f / static_cast<float>(IH);
-
-        auto OW = (_outDesc.numDims() >= 4) ? _outDesc.dim(Dim::N) : 1;
-        auto OH = _outDesc.dim(Dim::W);
-
-        float step_x = 0.0f;
-        float step_y = 0.0f;
-
-        if (_step == 0) {
-            step_x = static_cast<float>(IW) / W;
-            step_y = static_cast<float>(IH) / H;
-        } else {
-            step_x = _step;
-            step_y = _step;
-        }
-
-        auto dst_data = tempPtr;
-
-        int dim = H * W * _num_priors * 4;
-        float center_x = 0.0f;
-        float center_y = 0.0f;
-
-        float box_width = 0.0f;
-        float box_height = 0.0f;
-
-        if (_outDesc.dim(Dim::W) != dim || _outDesc.dim(Dim::H) != 2) {
-            THROW_IE_EXCEPTION << "[VPU] PriorBox output have invalid dimension, exptected " << dim << "x2"
-                               << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H)
-                               << ", layer name is: " << _layer->name;
-        }
-
-        auto max_fp16 = [](const float value, const float min) {
-            return ie::PrecisionUtils::f32tof16(value > min ? value : min);
-        };
-
-        auto min_fp16 = [](const float value, const float max) {
-            return ie::PrecisionUtils::f32tof16(value < max ? value : max);
-        };
-
-        size_t idx = 0;
-        for (int h = 0; h < H; ++h) {
-            for (int w = 0; w < W;  ++w) {
-                if (_step == 0) {
-                    center_x = (static_cast<float>(w) + 0.5f) * step_x;
-                    center_y = (static_cast<float>(h) + 0.5f) * step_y;
-                } else {
-                    center_x = (_offset + static_cast<float>(w)) * _step;
-                    center_y = (_offset + static_cast<float>(h)) * _step;
-                }
-
-                for (size_t s = 0; s < _fixed_sizes.size(); ++s) {
-                    auto fixed_size_ = static_cast<size_t>(_fixed_sizes[s]);
-                    box_width = box_height = fixed_size_ * 0.5f;
-
-                    int density_ = 0;
-                    int shift = 0;
-                    if (s < _densitys.size()) {
-                        density_ = static_cast<size_t>(_densitys[s]);
-                        shift = static_cast<int>(_fixed_sizes[s] / density_);
-                    }
-
-                    if (!_fixed_ratios.empty()) {
-                        for (const auto& fr : _fixed_ratios) {
-                            const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(fr);
-                            const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(fr);
-
-                            for (size_t r = 0; r < density_; ++r) {
-                                for (size_t c = 0; c < density_; ++c) {
-                                    const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
-                                    const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
-                                    dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
-                                    dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
-                                    dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
-                                    dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
-                                }
-                            }
-                        }
-                    } else {
-                        if (!_densitys.empty()) {
-                            for (int r = 0; r < density_; ++r) {
-                                for (int c = 0; c < density_; ++c) {
-                                    const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
-                                    const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
-                                    dst_data[idx++] = max_fp16((center_x_temp - box_width) * IWI, 0.f);
-                                    dst_data[idx++] = max_fp16((center_y_temp - box_height) * IHI, 0.f);
-                                    dst_data[idx++] = min_fp16((center_x_temp + box_width) * IWI, 1.f);
-                                    dst_data[idx++] = min_fp16((center_y_temp + box_height) * IHI, 1.f);
-                                }
-                            }
-                        }
-                        //  Rest of priors
-                        for (const auto& ar : _aspect_ratios) {
-                            if (fabs(ar - 1.) < 1e-6) {
-                                continue;
-                            }
-
-                            const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(ar);
-                            const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(ar);
-                            for (int r = 0; r < density_; ++r) {
-                                for (int c = 0; c < density_; ++c) {
-                                    const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
-                                    const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
-
-                                    dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
-                                    dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
-                                    dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
-                                    dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
-                                }
-                            }
-                        }
-                    }
-                }
-
-                for (size_t msIdx = 0; msIdx < _min_sizes.size(); msIdx++) {
-                    box_width = _min_sizes[msIdx];
-                    box_height = _min_sizes[msIdx];
-
-                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
-                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
-                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
-                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
-
-                    if (_max_sizes.size() > msIdx) {
-                        box_width = box_height = std::sqrt(_min_sizes[msIdx] * _max_sizes[msIdx]);
-
-                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
-                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
-                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
-                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
-                    }
-
-                    if (_scale_all_sizes || (!_scale_all_sizes && (msIdx == _min_sizes.size() - 1))) {
-                        size_t sIdx = _scale_all_sizes ? msIdx : 0;
-                        for (const auto& ar : _aspect_ratios) {
-                            if (std::fabs(ar - 1.0f) < 1e-6) {
-                                continue;
-                            }
-
-                            box_width = _min_sizes[sIdx] * std::sqrt(ar);
-                            box_height = _min_sizes[sIdx] / std::sqrt(ar);
-
-                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
-                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
-                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
-                            dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
-                        }
-                    }
-                }
-            }
-        }
-
-        if (_clip) {
-            for (int d = 0; d < dim; ++d) {
-                dst_data[d] = (std::min)((std::max)(dst_data[d], ie::PrecisionUtils::f32tof16(0.0f)), ie::PrecisionUtils::f32tof16(1.0f));
-            }
-        }
-
-        int channel_size = OH * OW;
-
-        dst_data += channel_size;
-
-        if (_variance.size() == 1) {
-            ie::parallel_for(channel_size, [&](int i) {
-                dst_data[i] = ie::PrecisionUtils::f32tof16(_variance[0]);
-            });
-        } else {
-            ie::parallel_for4d(H, W, _num_priors, 4, [&](int h, int w, int i, int j) {
-                dst_data[j + 4 * (i + _num_priors * (w + W * h))] = ie::PrecisionUtils::f32tof16(_variance[j]);
-            });
-        }
-    }
-
-private:
-    DataDesc _inDesc0;
-    DataDesc _inDesc1;
-    DataDesc _outDesc;
-    ie::CNNLayerPtr _layer;
-};
-
-class PriorBoxClusteredContent final : public CalculatedDataContent {
-public:
-    PriorBoxClusteredContent(
-        const DataDesc& inDesc0,
-        const DataDesc& inDesc1,
-        const DataDesc& outDesc,
-        const ie::CNNLayerPtr& layer) :
-        _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
-        _layer(layer) {
-        IE_ASSERT(layer != nullptr);
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>&, void* tempBuf) const override {
-        VPU_PROFILE(PriorBoxClusteredContent);
-
-        auto tempPtr = static_cast<fp16_t*>(tempBuf);
-
-        auto widths_ = _layer->GetParamAsFloats("width");
-        auto heights_ = _layer->GetParamAsFloats("height");
-        auto clip_ = _layer->GetParamAsInt("clip");
-        auto variance_ = _layer->GetParamAsFloats("variance");
-        auto img_h_ = _layer->GetParamAsInt("img_h", 0);
-        auto img_w_ = _layer->GetParamAsInt("img_w", 0);
-        auto step_ = _layer->GetParamAsFloat("step", 0);
-        auto step_h_ = _layer->GetParamAsFloat("step_h", 0);
-        auto step_w_ = _layer->GetParamAsFloat("step_w", 0);
-        auto offset_ = _layer->GetParamAsFloat("offset", 0);
-
-        auto num_priors_ = widths_.size();
-
-        if (variance_.empty()) {
-            variance_.push_back(0.1);
-        }
-
-        auto layer_width  = _inDesc0.dim(Dim::W);
-        auto layer_height = _inDesc0.dim(Dim::H);
-
-        auto img_width  = img_w_ == 0 ? _inDesc1.dim(Dim::W) : img_w_;
-        auto img_height = img_h_ == 0 ? _inDesc1.dim(Dim::H) : img_h_;
-
-        auto step_w = step_w_ == 0 ? step_ : step_w_;
-        auto step_h = step_h_ == 0 ? step_ : step_h_;
-        if (step_w == 0 || step_h == 0) {
-            step_w = static_cast<float>(img_width) / layer_width;
-            step_h = static_cast<float>(img_height) / layer_height;
-        }
-
-        auto expetected_output_dimx = layer_height * layer_width * num_priors_ * 4;
-        if (_outDesc.dim(Dim::W) != expetected_output_dimx || _outDesc.dim(Dim::H) != 2) {
-            THROW_IE_EXCEPTION << "PriorBoxClustered output has invalid dimension, exptected " << expetected_output_dimx << "x2"
-                               << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
-        }
-
-        auto offset = _outDesc.dim(Dim::W);
-        auto var_size = variance_.size();
-
-        auto top_data_0 = tempPtr;
-        auto top_data_1 = top_data_0 + offset;
-
-        ie::parallel_for2d(layer_height, layer_width, [=](int h, int w) {
-            auto center_x = (w + offset_) * step_w;
-            auto center_y = (h + offset_) * step_h;
-
-            for (int s = 0; s < num_priors_; ++s) {
-                auto box_width  = widths_[s];
-                auto box_height = heights_[s];
-
-                auto xmin = (center_x - box_width  / 2.0f) / img_width;
-                auto ymin = (center_y - box_height / 2.0f) / img_height;
-                auto xmax = (center_x + box_width  / 2.0f) / img_width;
-                auto ymax = (center_y + box_height / 2.0f) / img_height;
-
-                if (clip_) {
-                    xmin = std::min(std::max(xmin, 0.0f), 1.0f);
-                    ymin = std::min(std::max(ymin, 0.0f), 1.0f);
-                    xmax = std::min(std::max(xmax, 0.0f), 1.0f);
-                    ymax = std::min(std::max(ymax, 0.0f), 1.0f);
-                }
-
-                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 0] = ie::PrecisionUtils::f32tof16(xmin);
-                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 1] = ie::PrecisionUtils::f32tof16(ymin);
-                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 2] = ie::PrecisionUtils::f32tof16(xmax);
-                top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 3] = ie::PrecisionUtils::f32tof16(ymax);
-
-                for (int j = 0; j < var_size; j++) {
-                    auto index = h * layer_width * num_priors_ * var_size + w * num_priors_ * var_size + s * var_size + j;
-                    top_data_1[index] = ie::PrecisionUtils::f32tof16(variance_[j]);
-                }
-            }
-        });
-    }
-
-private:
-    DataDesc _inDesc0;
-    DataDesc _inDesc1;
-    DataDesc _outDesc;
-    ie::CNNLayerPtr _layer;
-};
-
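For reference, the clustered-prior computation deleted above reduces to a plain normalized box around each grid cell. The values below are illustrative only, not taken from any model:

// Worked example: cell (h = 0, w = 1), step_w = 16, offset_ = 0.5,
// box width 32, image width 320:
//   center_x = (1 + 0.5) * 16         = 24
//   xmin     = (24 - 32 / 2.0f) / 320 = 0.025
//   xmax     = (24 + 32 / 2.0f) / 320 = 0.125
// The y coordinates follow the same pattern with step_h, the box height and
// the image height; clip_ finally clamps every coordinate to [0, 1].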
 //
 // UnusedDataRemover class deletes data that has no consumers,
 // and also recursively deletes all its unused predecessors, including
index c52d275..675fad4 100644 (file)
@@ -5,6 +5,7 @@
 #include <vpu/middleend/pass_manager.hpp>
 #include <vpu/middleend/sw/utility.hpp>
 #include <vpu/model/data.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
 
 #include <precision_utils.h>
 
index d5224a0..5ff9b9e 100644 (file)
@@ -4,6 +4,8 @@
 
 #include "vpu/middleend/pass_manager.hpp"
 #include "vpu/utils/numeric.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
+
 #include "precision_utils.h"
 #include "ie_memcpy.h"
 
index f0c6ea5..57d7d92 100644 (file)
@@ -4,13 +4,14 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
-#include <vector>
-#include <set>
-#include <memory>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
 
 #include <precision_utils.h>
 
-#include <vpu/utils/numeric.hpp>
+#include <vector>
+#include <set>
+#include <memory>
 
 namespace vpu {
 
index f03f90b..dfa85e1 100644 (file)
@@ -4,14 +4,15 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/compile_env.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <vector>
 #include <set>
 #include <memory>
 #include <array>
 
-#include <vpu/compile_env.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
 namespace vpu {
 
 namespace {
index 908be06..7d93920 100644 (file)
@@ -4,6 +4,12 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/middleend/hw/tiling.hpp>
+#include <vpu/middleend/hw/utility.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <precision_utils.h>
+
 #include <memory>
 #include <array>
 #include <string>
 #include <tuple>
 #include <limits>
 
-#include <precision_utils.h>
-
-#include <vpu/middleend/hw/tiling.hpp>
-#include <vpu/middleend/hw/utility.hpp>
-
 namespace vpu {
 
 namespace {
index fe9c67a..2a5a550 100644 (file)
@@ -6,6 +6,7 @@
 #include "vpu/stage_builder.hpp"
 #include "vpu/utils/numeric.hpp"
 #include "precision_utils.h"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
 
 #include <memory>
 #include <set>
index 08a1de7..769949d 100644 (file)
@@ -3,6 +3,10 @@
 //
 
 #include <vpu/middleend/pass_manager.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/conv_weights_contents.hpp>
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
 #include <limits>
 
 #include <vector>
 #include <unordered_set>
 #include <set>
 
-#include <vpu/middleend/sw/utility.hpp>
-
 #define REFERENCE_CONVOLUTION 0
 
 namespace vpu {
 
 namespace {
 
-class ConvIm2ColWeightsContent final : public CalculatedDataContent {
-public:
-    explicit ConvIm2ColWeightsContent(const DataContent::Ptr& origContent) :
-            CalculatedDataContent({origContent}) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(ConvIm2ColWeightsContent);
-        kchw_to_khwc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-    }
-};
-
-class Conv3x3WeightsContent final : public CalculatedDataContent {
-public:
-    explicit Conv3x3WeightsContent(const DataContent::Ptr& origContent) :
-            CalculatedDataContent({origContent}) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(Conv3x3WeightsContent);
-        kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-    }
-};
-
-class ConvCHWWeightsContent final : public CalculatedDataContent {
-public:
-    explicit ConvCHWWeightsContent(const DataContent::Ptr& origContent) :
-            CalculatedDataContent({origContent}) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(ConvCHWWeightsContent);
-        kchw_to_hwkc(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-    }
-};
-
 class ConvStage final : public StageNode {
 public:
     using StageNode::StageNode;
@@ -124,7 +87,7 @@ private:
                     weights,
                     "@SW",
                     newWeightsDesc,
-                    std::make_shared<DefaultSwWeightsContent>(weights->content()));
+                    std::make_shared<DefaultSwWeightsContent>(weights->content(), newWeightsDesc));
 
                 weights->attrs().set<Data>("swWeights", swWeights);
             }
@@ -149,7 +112,7 @@ private:
                         weights,
                         "@SW",
                         newWeightsDesc,
-                        std::make_shared<DefaultSwWeightsContent>(weights->content()));
+                        std::make_shared<DefaultSwWeightsContent>(weights->content(), newWeightsDesc));
                 } else if (isConv1x1) {
                     swWeights = model()->duplicateData(
                         weights,
@@ -161,13 +124,13 @@ private:
                         weights,
                         "@SW",
                         newWeightsDesc,
-                        std::make_shared<Conv3x3WeightsContent>(weights->content()));
+                        std::make_shared<Conv3x3WeightsContent>(weights->content(), newWeightsDesc));
                 } else {
                     swWeights = model()->duplicateData(
                         weights,
                         "@SW",
                         newWeightsDesc,
-                        std::make_shared<ConvIm2ColWeightsContent>(weights->content()));
+                        std::make_shared<ConvIm2ColWeightsContent>(weights->content(), newWeightsDesc));
 
                     double im2ColBufSizeF = static_cast<double>(kernelSizeX) * kernelSizeY *
                         output->desc().dim(Dim::W) * output->desc().dim(Dim::H) * input->desc().dim(Dim::C)
@@ -215,7 +178,7 @@ private:
                         weights,
                         "@SW",
                         newWeightsDesc,
-                        std::make_shared<ConvCHWWeightsContent>(weights->content()));
+                        std::make_shared<ConvCHWWeightsContent>(weights->content(), newWeightsDesc));
                 }
 
                 weights->attrs().set<Data>("swWeights", swWeights);
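The same change is applied to every software-convolution weights content in this pass: the duplicated data keeps its descriptor, and that descriptor is now also handed to the content object, which computes its own byteSize() from it (see the new conv_weights_contents.cpp and default_sw_weights_content.cpp later in this diff). A minimal sketch of the updated call shape, assuming model(), weights and newWeightsDesc as in the pass above:

// Sketch only - mirrors the calls updated in this hunk.
auto swWeights = model()->duplicateData(
    weights,
    "@SW",
    newWeightsDesc,
    std::make_shared<DefaultSwWeightsContent>(weights->content(), newWeightsDesc));
weights->attrs().set<Data>("swWeights", swWeights);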
index e402756..a8d9553 100644 (file)
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
+#include <ie_parallel.hpp>
+
 #include <vector>
 #include <string>
 #include <memory>
 #include <unordered_set>
 #include <set>
 
-#include <ie_parallel.hpp>
-
-#include <vpu/middleend/sw/utility.hpp>
-#include <vpu/utils/numeric.hpp>
-
 namespace vpu {
 
 namespace {
 
-void depthDeconvolutionRelayoutCHW(
-        const fp16_t* src, int src_size,
-        fp16_t* dst, int dst_size,
-        int KX, int KY,
-        int channels) {
-    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
-        int iidx = c * KX * KY + ky * KX + kx;
-        IE_ASSERT(iidx >= 0 && iidx < src_size);
-
-        int inv_kx = KX - kx - 1;
-        int inv_ky = KY - ky - 1;
-        int oidx = c * KX * KY + inv_ky * KX + inv_kx;
-        IE_ASSERT(oidx >= 0 && oidx < dst_size);
-
-        dst[oidx] = src[iidx];
-    });
-}
-
-class DepthDeconvolutionCHWWeightsContent final : public CalculatedDataContent {
-public:
-    DepthDeconvolutionCHWWeightsContent(
-            const DataContent::Ptr& origContent,
-            int KX, int KY, int channels) :
-            CalculatedDataContent({origContent}),
-            _KX(KX), _KY(KY), _channels(channels) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
-        depthDeconvolutionRelayoutCHW(
-            baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
-            static_cast<fp16_t*>(tempBuf), desc().totalDimSize(),
-            _KX, _KY, _channels);
-    }
-
-private:
-    int _KX;
-    int _KY;
-    int _channels;
-};
-
-void depthDeconvolutionRelayoutHWC(
-        const fp16_t* src, int src_size,
-        fp16_t* dst, int dst_size,
-        int KX, int KY,
-        int channels) {
-    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
-        int iidx = c * KX * KY + ky * KX + kx;
-        IE_ASSERT(iidx < src_size);
-
-        int inv_kx = KX - kx - 1;
-        int inv_ky = KY - ky - 1;
-        int oidx = inv_ky * KX * channels + inv_kx * channels + c;
-        IE_ASSERT(oidx < dst_size);
-
-        dst[oidx] = src[iidx];
-    });
-}
-
-class DepthDeconvolutionHWCWeightsContent final : public CalculatedDataContent {
-public:
-    DepthDeconvolutionHWCWeightsContent(
-            const DataContent::Ptr& origContent,
-            int KX, int KY, int channels) :
-            CalculatedDataContent({origContent}),
-            _KX(KX), _KY(KY), _channels(channels) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
-        depthDeconvolutionRelayoutHWC(
-            baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
-            static_cast<fp16_t*>(tempBuf), desc().totalDimSize(),
-            _KX, _KY, _channels);
-    }
-
-private:
-    int _KX;
-    int _KY;
-    int _channels;
-};
-
-void deconvolutionRelayout(
-    const fp16_t* src, int src_size,
-    fp16_t* dst, int dst_size,
-    int KX, int KY,
-    int IC, int OC) {
-    ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
-        int iidx = ic * OC * KY * KX
-                 + oc * KY * KX
-                 + ky * KX
-                 + kx;
-        IE_ASSERT(iidx >= 0 && iidx < src_size);
-
-        int inv_kx = KX - kx - 1;
-        int inv_ky = KY - ky - 1;
-        int oidx = oc * IC * KY * KX
-                 + ic * KY * KX
-                 + inv_ky * KX
-                 + inv_kx;
-        IE_ASSERT(oidx >=  0 && oidx < dst_size);
-
-        dst[oidx] = src[iidx];
-    });
-}
-
-class DeconvolutionWeightsContent final : public CalculatedDataContent {
-public:
-    DeconvolutionWeightsContent(
-            const DataContent::Ptr& origContent,
-            int KX, int KY,
-            int IC, int OC) :
-            CalculatedDataContent({origContent}),
-            _KX(KX), _KY(KY),
-            _IC(IC), _OC(OC) {
-    }
-
-protected:
-    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const override {
-        return 2 * desc().totalDimSize() * sizeof(fp16_t);
-    }
-
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(DeconvolutionWeightsContent);
-
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-        auto dstPtr2 = dstPtr + desc().totalDimSize();
-
-        deconvolutionRelayout(
-            baseContents[0]->get<fp16_t>(), desc().totalDimSize(),
-            dstPtr2, desc().totalDimSize(),
-            _KX, _KY,
-            _IC, _OC);
-
-        kchw_to_hwkc(dstPtr2, dstPtr, desc());
-    }
-
-private:
-    int _KX;
-    int _KY;
-    int _IC;
-    int _OC;
-};
-
 class DeconvStage final : public StageNode {
 public:
     using StageNode::StageNode;
@@ -287,6 +141,7 @@ private:
                     newWeightsDesc,
                     std::make_shared<DeconvolutionWeightsContent>(
                         weights->content(),
+                        newWeightsDesc,
                         kernelSizeX, kernelSizeY,
                         input->desc().dim(Dim::C),
                         output->desc().dim(Dim::C)));
index 61c11b8..fcbb10d 100644 (file)
@@ -4,13 +4,14 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
 #include <vector>
 #include <memory>
 #include <string>
 #include <set>
 
-#include <vpu/middleend/sw/utility.hpp>
-
 namespace vpu {
 
 namespace {
@@ -46,7 +47,7 @@ private:
                 weights,
                 "@SW",
                 weights->desc(),
-                std::make_shared<DefaultSwWeightsContent>(weights->content()));
+                std::make_shared<DefaultSwWeightsContent>(weights->content(), weights->desc()));
 
             weights->attrs().set<Data>("swWeights", swWeights);
         }
index d27f135..cffe70a 100644 (file)
@@ -4,6 +4,14 @@
 
 #include <vpu/middleend/pass_manager.hpp>
 
+#include <vpu/utils/numeric.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+#include <vpu/model/data_contents/scaled_content.hpp>
+
+#include <details/caseless.hpp>
+#include <precision_utils.h>
+
 #include <cmath>
 
 #include <sstream>
 #include <list>
 #include <set>
 
-#include <precision_utils.h>
-
-#include <vpu/utils/numeric.hpp>
-#include <vpu/compile_env.hpp>
-
-#include <details/caseless.hpp>
-
 namespace vpu {
 
 namespace {
@@ -198,7 +199,9 @@ void addScaleInput(const Model& model, const Stage& stage, float scale) {
     IE_ASSERT(stage->output(0)->desc().dims().has(Dim::C));
     const auto outputChannels = stage->output(0)->desc().dims()[Dim::C];
 
-    auto scaleInput = model->addConstData(stage->name() + "@scales", DataDesc{{outputChannels}}, replicateContent(1.0f / scale, outputChannels));
+    auto scaleInput = model->addConstData(stage->name() + "@scales",
+                                          DataDesc{{outputChannels}},
+                                          replicateContent(1.0f / scale, outputChannels, DataDesc{outputChannels}));
     model->replaceStageInput(stage->inputEdge(SCALES_IDX), scaleInput);
 }
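replicateContent() gets the same treatment: the scalar-fill overload now also receives the descriptor of the constant it produces. An illustration of the assumed behaviour, based on the ReplicatedContent implementation removed from data.cpp later in this diff:

// Illustration only:
//   replicateContent(0.5f, 4, DataDesc{4})
// produces an FP16 constant of four elements, each equal to f32tof16(0.5f).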
 
index aeae81a..645c69d 100644 (file)
 namespace vpu {
 
 //
-// DefaultSwWeightsContent
-//
-
-DefaultSwWeightsContent::DefaultSwWeightsContent(const DataContent::Ptr& origContent) :
-        CalculatedDataContent({origContent}) {
-}
-
-void DefaultSwWeightsContent::fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const {
-    VPU_PROFILE(DefaultSwWeightsContent);
-
-    IE_ASSERT(desc().type() == DataType::FP16);
-    IE_ASSERT(baseContents.size() == 1);
-
-    kchw_to_hwck(baseContents[0]->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), desc());
-}
-
-//
 // getOneOfSingleNextStage
 //
 
index 3c41b86..cbb6247 100644 (file)
@@ -4,6 +4,16 @@
 
 #include <vpu/model/data.hpp>
 
+#include <vpu/model/edges.hpp>
+#include <vpu/model/stage.hpp>
+#include <vpu/backend/backend.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/numeric.hpp>
+#include <vpu/compile_env.hpp>
+
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
+
 #include <array>
 #include <algorithm>
 #include <queue>
 #include <set>
 #include <utility>
 
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
-#include <vpu/model/edges.hpp>
-#include <vpu/model/stage.hpp>
-#include <vpu/backend/backend.hpp>
-#include <vpu/utils/ie_helpers.hpp>
-#include <vpu/utils/numeric.hpp>
-#include <vpu/compile_env.hpp>
-
 namespace vpu {
 
 //
-// DataContent
-//
-
-DataContent::~DataContent() = default;
-
-const void* CalculatedDataContent::getRaw() const {
-    if (_temp.empty()) {
-        _temp.resize(getTempBufSize(_baseContents));
-        fillTempBuf(_baseContents, _temp.data());
-        _baseContents.clear();
-    }
-    return _temp.data();
-}
-
-size_t CalculatedDataContent::getTempBufSize(const SmallVector<DataContent::Ptr, 2>&) const {
-    return checked_cast<size_t>(desc().totalDimSize()) *
-           checked_cast<size_t>(desc().elemSize());
-}
-
-namespace {
-
-class IeBlobContent final : public DataContent {
-public:
-    IeBlobContent(const ie::Blob::Ptr& blob, int repeat) : _blob(blob), _repeat(repeat) {}
-
-protected:
-    const void* getRaw() const override {
-        if (desc().type() == DataType::FP16) {
-            if (_blobFp16 == nullptr) {
-                _blobFp16 = getBlobFP16(_blob);
-                _blob.reset();
-            }
-
-            if (_repeat == 1) {
-                return _blobFp16->cbuffer();
-            } else {
-                if (_tempFp16.empty()) {
-                    VPU_PROFILE(IeBlobContent);
-
-                    IE_ASSERT(desc().totalDimSize() % _repeat == 0);
-
-                    auto origNumElems = desc().totalDimSize() / _repeat;
-                    IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blobFp16->size());
-
-                    auto origPtr = _blobFp16->cbuffer().as<const fp16_t*>();
-                    IE_ASSERT(origPtr != nullptr);
-
-                    _tempFp16.resize(checked_cast<size_t>(desc().totalDimSize()));
-
-                    ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
-                        std::copy_n(origPtr, origNumElems, _tempFp16.data() + i * origNumElems);
-                    });
-                }
-
-                return _tempFp16.data();
-            }
-        } else if (desc().type() == DataType::S32) {
-            if (_repeat == 1) {
-                return _blob->cbuffer();
-            } else {
-                if (_tempS32.empty()) {
-                    VPU_PROFILE(IeBlobContent);
-
-                    IE_ASSERT(desc().totalDimSize() % _repeat == 0);
-
-                    auto origNumElems = desc().totalDimSize() / _repeat;
-                    IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blob->size());
-
-                    auto origPtr = _blob->cbuffer().as<const int32_t*>();
-                    IE_ASSERT(origPtr != nullptr);
-
-                    _tempS32.resize(checked_cast<size_t>(desc().totalDimSize()));
-
-                    ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
-                        std::copy_n(origPtr, origNumElems, _tempS32.data() + i * origNumElems);
-                    });
-                }
-
-                return _tempS32.data();
-            }
-        } else {
-            VPU_THROW_EXCEPTION << "Unsupported data type " << desc().type();
-        }
-    }
-
-private:
-    mutable ie::Blob::Ptr _blob;
-    int _repeat = 0;
-
-    mutable ie::Blob::Ptr _blobFp16;
-    mutable std::vector<fp16_t> _tempFp16;
-    mutable std::vector<int32_t> _tempS32;
-};
-
-}  // namespace
-
-DataContent::Ptr ieBlobContent(const ie::Blob::Ptr& blob, int repeat) {
-    return std::make_shared<IeBlobContent>(blob, repeat);
-}
-
-namespace {
-
-class ReplicatedContent final : public CalculatedDataContent {
-public:
-    ReplicatedContent(float val, int count) : _factor{val}, _count(count) {}
-
-    ReplicatedContent(DataContent::Ptr origContent, int count) :
-        CalculatedDataContent({std::move(origContent)}), _count(count) {
-    }
-
-protected:
-    size_t getTempBufSize(const SmallVector<DataContent::Ptr, 2>& baseContents) const override {
-        if (baseContents.empty()) {
-            return checked_cast<size_t>(_count) * sizeof(fp16_t);
-        } else {
-            IE_ASSERT(baseContents.size() == 1);
-            IE_ASSERT(desc().totalDimSize() % _count == 0);
-
-            return checked_cast<size_t>(desc().totalDimSize()) * sizeof(fp16_t);
-        }
-    }
-
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(ReplicatedContent);
-
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        if (baseContents.empty()) {
-            std::fill_n(dstPtr, _count, ie::PrecisionUtils::f32tof16(_factor));
-        } else {
-            IE_ASSERT(baseContents.size() == 1);
-            IE_ASSERT(desc().totalDimSize() % _count == 0);
-
-            auto origCount = desc().totalDimSize() / _count;
-            auto origPtr = baseContents[0]->get<fp16_t>();
-            IE_ASSERT(origPtr != nullptr);
-
-            ie::parallel_for(_count, [origPtr, origCount, dstPtr](int i) {
-                std::copy_n(origPtr, origCount, dstPtr + i * origCount);
-            });
-        }
-    }
-
-private:
-    float _factor = 1.0f;
-    int _count = 0;
-};
-
-}  // namespace
-
-DataContent::Ptr replicateContent(float val, int count) {
-    return std::make_shared<ReplicatedContent>(val, count);
-}
-
-DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count) {
-    return std::make_shared<ReplicatedContent>(origContent, count);
-}
-
-namespace {
-
-class ScaledContent final : public CalculatedDataContent {
-public:
-    ScaledContent(const DataContent::Ptr& origContent, float scale) :
-        CalculatedDataContent({origContent}), _factor(scale) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(ScaledContent);
-
-        IE_ASSERT(baseContents.size() == 1);
-
-        auto totalSize = desc().totalDimSize();
-
-        auto origDesc = baseContents[0]->desc();
-        IE_ASSERT(origDesc.type() == DataType::FP16);
-        IE_ASSERT(origDesc.totalDimSize() == totalSize);
-
-        auto srcPtr = baseContents[0]->get<fp16_t>();
-        IE_ASSERT(srcPtr != nullptr);
-
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        ie::parallel_for(totalSize, [this, srcPtr, dstPtr](int i) {
-            dstPtr[i] = ie::PrecisionUtils::f32tof16(ie::PrecisionUtils::f16tof32(srcPtr[i]) * _factor);
-        });
-    }
-
-private:
-    float _factor = 1.0f;
-};
-
-}  // namespace
-
-DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale) {
-    return std::make_shared<ScaledContent>(origContent, scale);
-}
-
-namespace {
-
-class ScaledChannelContent final : public CalculatedDataContent {
-public:
-    ScaledChannelContent(
-            const DataContent::Ptr& origContent,
-            const DataContent::Ptr& scaleContent) :
-            CalculatedDataContent({origContent, scaleContent}) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(ScaledChannelContent);
-
-        IE_ASSERT(baseContents.size() == 2);
-
-        auto totalSize = desc().totalDimSize();
-
-        IE_ASSERT(desc().numDims() == 4 && desc().dimsOrder() == DimsOrder::NCHW);
-        auto numN = desc().dim(Dim::N);
-        auto numC = desc().dim(Dim::C);
-        auto numH = desc().dim(Dim::H);
-        auto numW = desc().dim(Dim::W);
-
-        auto origDesc = baseContents[0]->desc();
-        IE_ASSERT(origDesc.type() == DataType::FP16);
-        IE_ASSERT(origDesc.totalDimSize() == totalSize);
-        IE_ASSERT(baseContents[1]->desc().totalDimSize() == numN);
-
-        auto srcPtr = baseContents[0]->get<fp16_t>();
-        IE_ASSERT(srcPtr != nullptr);
-
-        auto scale = baseContents[1]->get<fp16_t>();
-        IE_ASSERT(scale != nullptr);
-
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        for (int n = 0; n < numN; n++) {
-            for (int c = 0; c < numC; c++) {
-               for (int h = 0; h < numH; h++) {
-                   for (int w = 0; w < numW; w++) {
-                       dstPtr[n * numC * numH * numW + c * numH * numW + h * numW + w] =
-                               srcPtr[n * numC * numH * numW + c * numH * numW + h * numW + w] * scale[n];
-                   }
-               }
-            }
-        }
-    }
-};
-
-}  // namespace
-
-DataContent::Ptr scaledChannelContent(
-        const DataContent::Ptr& origContent,
-        const DataContent::Ptr& scaleContent) {
-    return std::make_shared<ScaledChannelContent>(origContent, scaleContent);
-}
-
-//
 // DataNode
 //
 
@@ -380,8 +123,7 @@ void DataNode::updateRequiredStrides(const StridesRequirement& newReqs) {
 }
 
 void DataNode::clearAllocation() {
-    _location = DataLocation::None;
-    _memoryOffset = 0;
+    _dataLocation = defaultDataLocation;
     attrs().erase("ioBufferOffset");
 }
 
@@ -393,69 +135,64 @@ void DataNode::setMemReqs(MemoryType mem) {
     _memReqs = mem;
 }
 
-void DataNode::setIOInfo(DataLocation location, int ioBufferOffset) {
-    IE_ASSERT(_usage == DataUsage::Input || _usage == DataUsage::Output);
+void DataNode::setIOInfo(Location location, int ioBufferOffset) {
+    VPU_INTERNAL_CHECK(_usage == DataUsage::Input || _usage == DataUsage::Output,
+        "Data {} failed: setIOInfo called for non IO data, actual usage is {}",
+        name(), usage());
 
     if (_usage == DataUsage::Input) {
-        IE_ASSERT(location == DataLocation::Input);
+        VPU_INTERNAL_CHECK(location == Location::Input,
+            "Input data {} failed: setIOInfo called with non input location, actual location is {}",
+            name(), location);
     } else if (_usage == DataUsage::Output) {
-        IE_ASSERT(location == DataLocation::Output);
+        VPU_INTERNAL_CHECK(location == Location::Output,
+            "Output data {} failed: setIOInfo called with non output location, actual location is {}",
+            name(), location);
     }
 
-    _location = location;
-    _memoryOffset = 0;
+    _dataLocation = {location, 0};
     attrs().set<int>("ioBufferOffset", ioBufferOffset);
 }
 
-void DataNode::setAllocationInfo(DataLocation location, int memoryOffset) {
-    IE_ASSERT(_usage == DataUsage::Const || _usage == DataUsage::Intermediate || _usage == DataUsage::Temp);
+void DataNode::setDataAllocationInfo(const DataLocation& dataLocation) {
+    VPU_INTERNAL_CHECK(_usage == DataUsage::Const || _usage == DataUsage::Intermediate || _usage == DataUsage::Temp,
+        "Data {} failed: setDataAllocationInfo called for data with incorrect usage, actual usage: {} "
+        "valid usages: {}, {}, {}", name(), usage(), DataUsage::Const, DataUsage::Intermediate, DataUsage::Temp);
 
     if (_usage == DataUsage::Const) {
-        IE_ASSERT(location == DataLocation::Blob);
+        VPU_INTERNAL_CHECK(dataLocation.location == Location::Blob,
+            "Const data {} failed: setDataAllocationInfo called with non blob location, actual location is {}",
+            name(), dataLocation.location);
     } else if (_usage == DataUsage::Temp) {
-        IE_ASSERT(location == DataLocation::BSS);
+        VPU_INTERNAL_CHECK(dataLocation.location == Location::BSS,
+            "Temp data {} failed: setDataAllocationInfo called with non bss location, actual location is {}",
+            name(), dataLocation.location);
     }
 
-    _location = location;
-    _memoryOffset = memoryOffset;
+    _dataLocation = dataLocation;
 }
 
-void DataNode::serializeBuffer(
-        BlobSerializer& serializer,
-        DimsOrder newOrder) {
-    if (newOrder.numDims() == 0) {
-        serializeBufferImpl(serializer, _desc, this->strides());
-    } else {
-        IE_ASSERT(newOrder.numDims() >= _desc.dimsOrder().numDims());
-
-        auto newDims = _desc.dims();
-        auto newStrides = this->strides();
-        auto newPerm = newOrder.toPermutation();
+void DataNode::setShapeAllocationInfo(const ShapeLocation& shapeLocation) {
+    _shapeLocation = shapeLocation;
+}
 
-        auto origOrder = _desc.dimsOrder();
-        auto origPerm = origOrder.toPermutation();
+void DataNode::serializeBuffer(
+        BlobSerializer& serializer) {
+    serializeDescImpl(serializer, _desc, this->strides());
 
-        size_t origPermInd = 0;
-        for (size_t i = 0; i < newPerm.size(); i++) {
-            auto d = newPerm[i];
+    serializer.append(checked_cast<uint32_t>(_dataLocation.location));
 
-            if (origPermInd < origPerm.size() && origPerm[origPermInd] == d) {
-                ++origPermInd;
-                continue;
-            }
+    if (_dataLocation.location == Location::Input || _dataLocation.location == Location::Output) {
+        auto topParent = getTopParentData();
 
-            newDims.set(d, 1);
-            if (i == 0) {
-                newStrides.set(d, _desc.elemSize());
-            } else {
-                newStrides.set(d, newStrides[newPerm[i - 1]] * newDims[newPerm[i - 1]]);
-            }
-        }
-        IE_ASSERT(origPermInd == origPerm.size());
+        auto ioIdx = topParent->attrs().get<int>("ioIdx");
+        serializer.append(checked_cast<uint32_t>(ioIdx));
 
-        DataDesc newDesc(_desc.type(), newOrder, newDims);
-        serializeBufferImpl(serializer, newDesc, newStrides);
+        auto parentByteSize = topParent->totalByteSize();
+        serializer.append(checked_cast<uint32_t>(parentByteSize));
     }
+
+    serializer.append(checked_cast<uint32_t>(_dataLocation.offset));
 }
 
 void DataNode::serializeIOInfo(BlobSerializer& serializer) const {
@@ -485,8 +222,6 @@ void DataNode::serializeDescImpl(
         const DimValues& storedStrides) const {
     IE_ASSERT(storedDesc.numDims() <= MAX_DIMS_32);
 
-    const auto& storedDims = storedDesc.dims();
-
     auto storedDimsOrder = storedDesc.dimsOrder();
 
     auto storedPerm = storedDimsOrder.toPermutation();
@@ -496,33 +231,13 @@ void DataNode::serializeDescImpl(
     serializer.append(checked_cast<uint32_t>(storedDimsOrder.code()));
 
     serializer.append(checked_cast<uint32_t>(storedPerm.size()));
-    for (auto d : storedPerm) {
-        serializer.append(checked_cast<uint32_t>(storedDims[d]));
-    }
-    for (auto d : storedPerm) {
-        serializer.append(checked_cast<uint32_t>(storedStrides[d]));
-    }
-}
-
-void DataNode::serializeBufferImpl(
-        BlobSerializer& serializer,
-        const DataDesc& storedDesc,
-        const DimValues& storedStrides) const {
-    serializeDescImpl(serializer, storedDesc, storedStrides);
 
-    serializer.append(checked_cast<uint32_t>(_location));
-
-    if (_location == DataLocation::Input || _location == DataLocation::Output) {
-        auto topParent = getTopParentData();
-
-        auto ioIdx = topParent->attrs().get<int>("ioIdx");
-        serializer.append(checked_cast<uint32_t>(ioIdx));
-
-        auto parentByteSize = topParent->totalByteSize();
-        serializer.append(checked_cast<uint32_t>(parentByteSize));
-    }
+    const auto& shape = shapeLocation();
 
-    serializer.append(checked_cast<uint32_t>(_memoryOffset));
+    serializer.append(checked_cast<uint32_t>(shape.dimsLocation));
+    serializer.append(checked_cast<uint32_t>(shape.dimsOffset));
+    serializer.append(checked_cast<uint32_t>(shape.stridesLocation));
+    serializer.append(checked_cast<uint32_t>(shape.stridesOffset));
 }
 
 void printTo(std::ostream& os, const Data& data) {
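After this hunk a data node carries its allocation state as a single DataLocation {location, offset} value plus a separate ShapeLocation, and serializeBuffer() no longer re-derives a permuted descriptor. A usage sketch; the field order of DataLocation is assumed from the `_dataLocation = {location, 0}` assignment above, and the variable names are illustrative:

// Sketch only.
constData->setDataAllocationInfo(DataLocation{Location::Blob, blobOffset});
inputData->setIOInfo(Location::Input, ioBufferOffset);

BlobSerializer serializer;
constData->serializeBuffer(serializer);  // descriptor + shape location, then data location and offset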
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/batch_norm_contents.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/batch_norm_contents.cpp
new file mode 100644 (file)
index 0000000..eb5eebb
--- /dev/null
@@ -0,0 +1,70 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/batch_norm_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+namespace ie = InferenceEngine;
+
+//
+// BatchNormalizationWeightsContent
+//
+
+BatchNormalizationWeightsContent::BatchNormalizationWeightsContent(const DataContent::Ptr& origContent,
+                                                                   float epsilon) :
+        _origContent(origContent), _epsilon(epsilon) {}
+
+size_t BatchNormalizationWeightsContent::byteSize() const {
+    return _origContent->byteSize();
+}
+
+void BatchNormalizationWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(BatchNormalizationWeightsContent);
+
+    auto srcPtr = _origContent->get<fp16_t>();
+    auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    ie::parallel_for(_origContent->byteSize() / sizeof(fp16_t), [this, srcPtr, dstPtr](int i) {
+        float val = ie::PrecisionUtils::f16tof32(srcPtr[i]) + _epsilon;
+        val = 1.0f / std::sqrt(val);
+        dstPtr[i] = ie::PrecisionUtils::f32tof16(val);
+    });
+}
+
+//
+// BatchNormalizationBiasesContent
+//
+
+BatchNormalizationBiasesContent::BatchNormalizationBiasesContent(const DataContent::Ptr& origContent,
+                                                                 const DataContent::Ptr& weightsContent) :
+        _origContent(origContent), _weightsContent(weightsContent) {}
+
+size_t BatchNormalizationBiasesContent::byteSize() const {
+    return _origContent->byteSize();
+}
+
+void BatchNormalizationBiasesContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(BatchNormalizationBiasesContent);
+
+    auto origPtr = _origContent->get<fp16_t>();
+    auto weightsPtr = _weightsContent->get<fp16_t>();
+
+    auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    ie::parallel_for(_origContent->byteSize() / sizeof(fp16_t), [origPtr, weightsPtr, dstPtr](int i) {
+        // TODO: beta needs to be extracted from the IE layer.
+        float beta = 0.0f;
+
+        auto wVal = ie::PrecisionUtils::f16tof32(weightsPtr[i]);
+        dstPtr[i] = ie::PrecisionUtils::f32tof16(beta - wVal * ie::PrecisionUtils::f16tof32(origPtr[i]));
+    });
+}
+
+} // namespace vpu
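The two contents above implement the usual batch-norm folding into a per-channel affine transform. Assuming the original weights blob holds the per-channel variance and the original biases blob the per-channel mean (this file does not state it explicitly), they produce:

//   w[i] = 1 / sqrt(var[i] + epsilon)
//   b[i] = beta - w[i] * mean[i]          (beta is hard-coded to 0 for now)
// so the folded layer computes y[i] = w[i] * x[i] + b[i] = (x[i] - mean[i]) / sqrt(var[i] + epsilon).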
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/calculated_data_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/calculated_data_content.cpp
new file mode 100644 (file)
index 0000000..647b22f
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/calculated_data_content.hpp>
+
+namespace vpu {
+
+const void* CalculatedDataContent::getRaw() const {
+    if (_temp.empty()) {
+        _temp.resize(byteSize());
+        fillTempBuf(_temp.data());
+    }
+    return _temp.data();
+}
+
+} // namespace vpu
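With the base-contents list gone, a derived content now stores its own inputs and descriptor and overrides just byteSize() and fillTempBuf(). A minimal hypothetical example in the same shape as the contents added by this commit; the class below is illustration only, does not exist in the repository, and the access specifiers of the base-class declarations are assumed:

#include <vpu/model/data_contents/calculated_data_content.hpp>

#include <vpu/utils/profiling.hpp>

#include <ie_parallel.hpp>
#include <precision_utils.h>

namespace vpu {

namespace ie = InferenceEngine;

// Illustration only: negates an FP16 constant element-wise.
class NegatedWeightsContent final : public CalculatedDataContent {
public:
    explicit NegatedWeightsContent(const DataContent::Ptr& origContent) :
            _origContent(origContent) {}

    size_t byteSize() const override {
        return _origContent->byteSize();
    }

protected:
    void fillTempBuf(void* tempBuf) const override {
        VPU_PROFILE(NegatedWeightsContent);

        auto srcPtr = _origContent->get<fp16_t>();
        auto dstPtr = static_cast<fp16_t*>(tempBuf);

        ie::parallel_for(byteSize() / sizeof(fp16_t), [srcPtr, dstPtr](int i) {
            dstPtr[i] = ie::PrecisionUtils::f32tof16(-ie::PrecisionUtils::f16tof32(srcPtr[i]));
        });
    }

private:
    DataContent::Ptr _origContent;
};

} // namespace vpu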
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/conv_weights_contents.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/conv_weights_contents.cpp
new file mode 100644 (file)
index 0000000..6c0eebc
--- /dev/null
@@ -0,0 +1,65 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/conv_weights_contents.hpp>
+
+#include <vpu/middleend/sw/utility.hpp>
+#include <vpu/utils/profiling.hpp>
+
+namespace vpu {
+
+//
+// ConvIm2ColWeightsContent
+//
+
+ConvIm2ColWeightsContent::ConvIm2ColWeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+        _origContent(origContent), _desc(desc) {}
+
+size_t ConvIm2ColWeightsContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+void ConvIm2ColWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(ConvIm2ColWeightsContent);
+    kchw_to_khwc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// Conv3x3WeightsContent
+//
+
+Conv3x3WeightsContent::Conv3x3WeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+        _origContent(origContent), _desc(desc) {
+}
+
+size_t Conv3x3WeightsContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+void Conv3x3WeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(Conv3x3WeightsContent);
+    kchw_to_hwkc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// ConvCHWWeightsContent
+//
+
+ConvCHWWeightsContent::ConvCHWWeightsContent(const DataContent::Ptr& origContent, DataDesc desc) :
+        _origContent(origContent), _desc(desc) {
+}
+
+size_t ConvCHWWeightsContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+void ConvCHWWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(ConvCHWWeightsContent);
+    kchw_to_hwkc(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/data_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/data_content.cpp
new file mode 100644 (file)
index 0000000..4fbe41a
--- /dev/null
@@ -0,0 +1,11 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/data_content.hpp>
+
+namespace vpu {
+
+DataContent::~DataContent() = default;
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/deconvolution_contents.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/deconvolution_contents.cpp
new file mode 100644 (file)
index 0000000..5e04f32
--- /dev/null
@@ -0,0 +1,174 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/deconvolution_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+//
+// DeconvolutionToConvolutionContent
+//
+
+DeconvolutionToConvolutionContent::DeconvolutionToConvolutionContent(
+        const DataContent::Ptr& origContent, const DataDesc& desc) :
+        _origContent(origContent), _desc(desc) {
+}
+
+size_t DeconvolutionToConvolutionContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+void DeconvolutionToConvolutionContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(DeconvolutionToConvolutionContent);
+
+    IE_ASSERT(_desc.type() == DataType::FP16);
+
+    deconv_to_conv(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+//
+// DepthDeconvolutionCHWWeightsContent
+//
+
+void depthDeconvolutionRelayoutCHW(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int channels) {
+    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+        int iidx = c * KX * KY + ky * KX + kx;
+        IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = c * KX * KY + inv_ky * KX + inv_kx;
+        IE_ASSERT(oidx >= 0 && oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
+
+DepthDeconvolutionCHWWeightsContent::DepthDeconvolutionCHWWeightsContent(
+        const DataContent::Ptr& origContent,
+        int KX, int KY, int channels) :
+        _origContent(origContent),
+        _KX(KX), _KY(KY), _channels(channels) {}
+
+void DepthDeconvolutionCHWWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(DepthDeconvolutionCHWWeightsContent);
+    depthDeconvolutionRelayoutCHW(
+            _origContent->get<fp16_t>(), _origContent->byteSize() / sizeof(fp16_t),
+            static_cast<fp16_t*>(tempBuf), _origContent->byteSize() / sizeof(fp16_t),
+            _KX, _KY, _channels);
+}
+
+size_t DepthDeconvolutionCHWWeightsContent::byteSize() const {
+    return _origContent->byteSize();
+}
+
+//
+// DepthDeconvolutionHWCWeightsContent
+//
+
+void depthDeconvolutionRelayoutHWC(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int channels) {
+    ie::parallel_for3d(channels, KY, KX, [=](int c, int ky, int kx) {
+        int iidx = c * KX * KY + ky * KX + kx;
+        IE_ASSERT(iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = inv_ky * KX * channels + inv_kx * channels + c;
+        IE_ASSERT(oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
+
+DepthDeconvolutionHWCWeightsContent::DepthDeconvolutionHWCWeightsContent(
+        const DataContent::Ptr& origContent,
+        int KX, int KY, int channels) :
+        _origContent(origContent),
+        _KX(KX), _KY(KY), _channels(channels) {
+}
+
+void DepthDeconvolutionHWCWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(DepthDeconvolutionHWCWeightsContent);
+    depthDeconvolutionRelayoutHWC(
+            _origContent->get<fp16_t>(), _origContent->byteSize() / sizeof(fp16_t),
+            static_cast<fp16_t*>(tempBuf), _origContent->byteSize() / sizeof(fp16_t),
+            _KX, _KY, _channels);
+}
+
+size_t DepthDeconvolutionHWCWeightsContent::byteSize() const {
+    return _origContent->byteSize();
+}
+
+//
+// DeconvolutionWeightsContent
+//
+
+void deconvolutionRelayout(
+        const fp16_t* src, int src_size,
+        fp16_t* dst, int dst_size,
+        int KX, int KY,
+        int IC, int OC) {
+    ie::parallel_for4d(OC, IC, KY, KX, [=](int oc, int ic, int ky, int kx) {
+        int iidx = ic * OC * KY * KX
+                   + oc * KY * KX
+                   + ky * KX
+                   + kx;
+        IE_ASSERT(iidx >= 0 && iidx < src_size);
+
+        int inv_kx = KX - kx - 1;
+        int inv_ky = KY - ky - 1;
+        int oidx = oc * IC * KY * KX
+                   + ic * KY * KX
+                   + inv_ky * KX
+                   + inv_kx;
+        IE_ASSERT(oidx >=  0 && oidx < dst_size);
+
+        dst[oidx] = src[iidx];
+    });
+}
+
+DeconvolutionWeightsContent::DeconvolutionWeightsContent(
+        const DataContent::Ptr& origContent,
+        DataDesc desc,
+        int KX, int KY,
+        int IC, int OC) :
+        _origContent(origContent), _desc(desc),
+        _intermBuf(_desc.totalDimSize()),
+        _KX(KX), _KY(KY),
+        _IC(IC), _OC(OC) {
+}
+
+size_t DeconvolutionWeightsContent::byteSize() const {
+    return _desc.totalDimSize() * sizeof(fp16_t);
+}
+
+void DeconvolutionWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(DeconvolutionWeightsContent);
+
+    auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    deconvolutionRelayout(
+            _origContent->get<fp16_t>(), _desc.totalDimSize(),
+            _intermBuf.data(), _desc.totalDimSize(),
+            _KX, _KY,
+            _IC, _OC);
+
+    kchw_to_hwkc(_intermBuf.data(), dstPtr, _desc);
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/default_sw_weights_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/default_sw_weights_content.cpp
new file mode 100644 (file)
index 0000000..4e43c32
--- /dev/null
@@ -0,0 +1,29 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/default_sw_weights_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+namespace vpu {
+
+DefaultSwWeightsContent::DefaultSwWeightsContent(const DataContent::Ptr& origContent, const DataDesc& desc) :
+        _origContent(origContent), _desc(desc) {
+}
+
+size_t DefaultSwWeightsContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+void DefaultSwWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(DefaultSwWeightsContent);
+
+    IE_ASSERT(_desc.type() == DataType::FP16);
+
+    kchw_to_hwck(_origContent->get<fp16_t>(), static_cast<fp16_t*>(tempBuf), _desc);
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_const_data_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_const_data_content.cpp
new file mode 100644 (file)
index 0000000..b3c393a
--- /dev/null
@@ -0,0 +1,101 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/hw_const_data_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+HwConstData::HwConstData(
+        const DataContent::Ptr& origContent,
+        const DataDesc& origDesc,
+        const DataDesc& resDesc,
+        const std::map<Dim, Slice> dimSlices) :
+        _origContent(origContent),
+        _origDesc(origDesc),
+        _resDesc(resDesc),
+        _dimSlices(dimSlices) {}
+
+size_t HwConstData::byteSize() const {
+    return checked_cast<size_t>(_resDesc.totalDimSize()) *
+           checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void HwConstData::fillTempBuf(void* outBuf) const {
+    VPU_PROFILE(HwConstData);
+
+    VPU_THROW_UNLESS(
+        _resDesc.type() == DataType::FP16,
+        "Constant data has {} data type while only {} is supported",
+        _resDesc.type(), DataType::FP16);
+
+    const auto srcData = _origContent->get<fp16_t>();
+    auto dstData = static_cast<fp16_t*>(outBuf);
+
+    VPU_THROW_UNLESS(srcData != nullptr,
+        "Source buffer for constant data has null address");
+
+    auto getDimSlice = [this](const Dim dim) {
+        auto it = _dimSlices.find(dim);
+        if (it != _dimSlices.end()) {
+            return it->second;
+        }
+
+        const int startInd = 0;
+        const size_t size = _origDesc.dim(dim);
+
+        return Slice(startInd, size);
+    };
+
+    if (_origDesc.numDims() == 4) {
+        Slice slice = getDimSlice(Dim::N);
+
+        int startOC = slice.start;
+        size_t numOC = slice.size;
+
+        const auto IC = _origDesc.dim(Dim::C);
+        const auto K = _origDesc.dim(Dim::H);
+        const auto V = _origDesc.dim(Dim::W);
+
+        const auto kernelStride     = V;
+        const auto inChannelStride  = K * kernelStride;
+        const auto outerStride      = IC * inChannelStride;
+
+        ie::parallel_for(numOC, [=](int oc) {
+            const auto ocSlice = oc;
+            oc += startOC;
+
+            const auto ocInner = oc % V;
+            const auto ocOuter = oc / V;
+            const auto ocSliceInner = ocSlice % V;
+            const auto ocSliceOuter = ocSlice / V;
+
+            const auto ocSrc = ocInner + ocOuter * outerStride;
+            const auto ocDst = ocSliceInner + ocSliceOuter * outerStride;
+
+            for (int ic = 0; ic < IC; ++ic)
+                for (int k = 0; k < K; ++k) {
+                    const auto srcInd = ocSrc +
+                                        k * kernelStride +
+                                        ic * inChannelStride;
+                    const auto dstInd = ocDst +
+                                        k * kernelStride +
+                                        ic * inChannelStride;
+
+                    dstData[dstInd] = srcData[srcInd];
+                }
+        });
+    } else if (_origDesc.numDims() == 1) {
+        Slice slice = getDimSlice(Dim::C);
+
+        std::copy(srcData + slice.start, srcData + slice.start + slice.size, dstData);
+    } else {
+        THROW_IE_EXCEPTION << "Invalid number of dimensions " << _origDesc.numDims();
+    }
+}
+
+} // namespace vpu
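A usage sketch for the slicing content above; the variable names and the tile split are illustrative (in the repository the descriptors come from the HW tiling passes):

// Sketch only: take output channels [8, 24) of a 4D FP16 constant for one tile.
// Dimensions not listed in the map keep their full range.
std::map<Dim, Slice> dimSlices{{Dim::N, Slice(8, 16)}};

auto tileContent = std::make_shared<HwConstData>(
    origContent,   // content of the full constant
    origDesc,      // descriptor of the full constant
    tileDesc,      // descriptor of the sliced tile
    dimSlices);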
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_weights_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/hw_weights_content.cpp
new file mode 100644 (file)
index 0000000..1d5e33c
--- /dev/null
@@ -0,0 +1,104 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/hw_weights_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+HwWeightsContent::HwWeightsContent(const DataContent::Ptr& origContent,
+                                   const DataDesc& origWeightsDesc,
+                                   const DataDesc& resDesc,
+                                   int numInputChannels,
+                                   int channelStartIndex) :
+        _origContent(origContent),
+        _origDesc(origWeightsDesc),
+        _resDesc(resDesc),
+        _numInputChannels(numInputChannels),
+        _channelStartIndex(channelStartIndex) {
+}
+
+size_t HwWeightsContent::byteSize() const {
+    return checked_cast<size_t>(_resDesc.totalDimSize()) *
+           checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void HwWeightsContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(HwWeightsContent);
+
+    IE_ASSERT(_resDesc.type() == DataType::FP16);
+
+    const auto KX = _origDesc.dim(Dim::W);
+    const auto KY = _origDesc.dim(Dim::H);
+    const auto IC = _origDesc.dim(Dim::C);
+    const auto OC = _origDesc.dim(Dim::N);
+    const auto origTotalSize = _origDesc.totalDimSize();
+
+    const auto HW_OC_inner = _resDesc.dim(Dim::W);
+    const auto HW_OC_outer = _resDesc.dim(Dim::N);
+    IE_ASSERT(HW_OC_outer * HW_OC_inner >= OC);
+
+    const auto HW_K = _resDesc.dim(Dim::H);
+    IE_ASSERT(HW_K == KX * KY);
+
+    IE_ASSERT(_channelStartIndex < IC);
+    const auto HW_IC = _resDesc.dim(Dim::C);
+    const auto HW_IC_real = std::min(_numInputChannels, IC - _channelStartIndex);
+
+    const auto srcData = _origContent->get<fp16_t>();
+    IE_ASSERT(srcData != nullptr);
+
+    auto dstData = static_cast<fp16_t*>(tempBuf);
+
+    IE_ASSERT((_channelStartIndex + HW_IC_real) * HW_K + (OC - 1) * HW_K * IC - 1 < origTotalSize);
+    IE_ASSERT((OC - 1) % HW_OC_inner +
+              (HW_K - 1) * HW_OC_inner +
+              (HW_IC_real - 1) * HW_OC_inner * HW_K +
+              ((OC - 1) / 8) * HW_OC_inner * HW_K * HW_IC < _resDesc.totalDimSize());
+
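+    // The asserts above check that the highest source and destination indices stay inside the
+    // original and result buffers. A separate fast path handles 1x1 kernels, where the spatial
+    // loop collapses; otherwise the full (oc, ic, ky, kx) repacking loop is used.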
+    if (KX == 1 && KY == 1) {
+        ie::parallel_for(OC, [=](int oc) {
+            const auto oc_inner = oc % HW_OC_inner;
+            const auto oc_outer = oc / HW_OC_inner;
+            for (int ic = 0; ic < HW_IC_real; ++ic) {
+                const auto srcInd =
+                        (_channelStartIndex + ic) +
+                        oc * IC;
+                const auto dstInd =
+                        oc_inner +
+                        ic * HW_OC_inner * HW_K +
+                        oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+                dstData[dstInd] = srcData[srcInd];
+            }
+        });
+    } else {
+        ie::parallel_for(OC, [=](int oc) {
+            const auto oc_inner = oc % HW_OC_inner;
+            const auto oc_outer = oc / HW_OC_inner;
+            for (int ic = 0; ic < HW_IC_real; ++ic) {
+                for (int ky = 0; ky < KY; ++ky) {
+                    for (int kx = 0; kx < KX; ++kx) {
+                        const auto srcInd =
+                                (kx + ky * KX) +
+                                (_channelStartIndex + ic) * HW_K +
+                                oc * HW_K * IC;
+                        const auto dstInd =
+                                oc_inner +
+                                (ky * KX + kx) * HW_OC_inner +
+                                ic * HW_OC_inner * HW_K +
+                                oc_outer * HW_OC_inner * HW_K * HW_IC;
+
+                        dstData[dstInd] = srcData[srcInd];
+                    }
+                }
+            }
+        });
+    }
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/ie_blob_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/ie_blob_content.cpp
new file mode 100644 (file)
index 0000000..4f61b98
--- /dev/null
@@ -0,0 +1,39 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <vpu/utils/ie_helpers.hpp>
+
+namespace vpu {
+
+IeBlobContent::IeBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType) : _blob(blob), _resultDataType(resultDataType) {
+    VPU_THROW_UNLESS(_resultDataType == DataType::FP16 || _resultDataType == DataType::S32,
+                     "IeBlobContent creation error: {} result type is unsupported, only {} and {} are supported",
+                     _resultDataType, DataType::FP16, DataType::S32);
+}
+
+size_t IeBlobContent::byteSize() const {
+    // The result may be converted to a type with a different element size
+    const auto elementSize = _resultDataType == DataType::FP16 ? sizeof(fp16_t) : sizeof(int32_t);
+    return elementSize * _blob->size();
+}
+
+const void* IeBlobContent::getRaw() const {
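+    // Convert the original blob to FP16 lazily on first access and cache the result;
+    // S32 blobs are returned as-is.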
+    if (_resultDataType == DataType::FP16) {
+        if (_blobFp16 == nullptr) {
+            _blobFp16 = _blob->getTensorDesc().getPrecision() == ie::Precision::FP16 ?
+                        _blob : convertBlobFP32toFP16(_blob);
+        }
+        return _blobFp16->cbuffer();
+    } else { // S32
+        return _blob->cbuffer();
+    }
+}
+
+DataContent::Ptr ieBlobContent(const ie::Blob::CPtr& blob, DataType resultDataType) {
+    return std::make_shared<IeBlobContent>(blob, resultDataType);
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/kernel_binary_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/kernel_binary_content.cpp
new file mode 100644 (file)
index 0000000..be35e4f
--- /dev/null
@@ -0,0 +1,23 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/kernel_binary_content.hpp>
+
+#include <string>
+
+namespace vpu {
+
+KernelBinaryContent::KernelBinaryContent(const std::string& blob) : _blob(blob) {
+    IE_ASSERT(!_blob.empty());
+}
+
+size_t KernelBinaryContent::byteSize() const {
+    return _blob.size();
+}
+
+const void* KernelBinaryContent::getRaw() const {
+    return _blob.data();
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/mean_contents.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/mean_contents.cpp
new file mode 100644 (file)
index 0000000..0d09472
--- /dev/null
@@ -0,0 +1,80 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/mean_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+#include <vpu/middleend/sw/utility.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+//
+// MeanImageContent
+//
+
+MeanImageContent::MeanImageContent(const ie::PreProcessInfo& info, const DataDesc& desc) : _info(info), _desc(desc) {}
+
+size_t MeanImageContent::byteSize() const {
+    size_t countElem = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C));
+    if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+        countElem *= 2;
+    }
+
+    return countElem * sizeof(fp16_t);
+}
+
+void MeanImageContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(MeanImageContent);
+
+    const size_t numOfChannel = _info.getNumberOfChannels();
+
+    const size_t imagePixels = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H));
+    const size_t countElem = checked_cast<size_t>(_desc.dim(Dim::W) * _desc.dim(Dim::H) * _desc.dim(Dim::C));
+
+    const auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    auto dstPtr2 = dstPtr;
+    if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+        dstPtr2 += countElem;
+    }
+
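+    // For interleaved layouts (NHWC/HWC) the mean image is first written in planar form into
+    // the second half of the temporary buffer and then transposed into the first half below.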
+    ie::parallel_for(numOfChannel, [=](size_t i) {
+        const auto meanDataBlob = _info[i]->meanData;
+
+        ie::PrecisionUtils::f32tof16Arrays(
+                dstPtr2 + i * imagePixels,
+                meanDataBlob->buffer().as<const float*>(),
+                imagePixels,
+                -1.0f);
+    });
+
+    if (_desc.dimsOrder() == DimsOrder::NHWC || _desc.dimsOrder() == DimsOrder::HWC) {
+        kchw_to_hwck(dstPtr2, dstPtr, _desc);
+    }
+}
+
+//
+// MeanValueContent
+//
+
+MeanValueContent::MeanValueContent(const ie::PreProcessInfo& info) : _info(info) {}
+
+size_t MeanValueContent::byteSize() const {
+    return _info.getNumberOfChannels() * sizeof(fp16_t);
+}
+
+void MeanValueContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(MeanValueContent);
+
+    const auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    ie::parallel_for(_info.getNumberOfChannels(), [dstPtr, this](size_t i) {
+        dstPtr[i] = ie::PrecisionUtils::f32tof16(-_info[i]->meanValue);
+    });
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/merge_fc_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/merge_fc_content.cpp
new file mode 100644 (file)
index 0000000..808aee6
--- /dev/null
@@ -0,0 +1,60 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/merge_fc_content.hpp>
+
+#include <ie_parallel.hpp>
+
+#include <numeric>
+
+namespace vpu {
+
+MergeFullyConnectedContentsByChannels::MergeFullyConnectedContentsByChannels(const std::vector<DataContent::CPtr> contents,
+                                                                             const std::vector<DataDesc> inDescs,
+                                                                             const DataDesc& resDesc) :
+        _contents(contents), _inDescs(inDescs), _resDesc(resDesc) {}
+
+size_t MergeFullyConnectedContentsByChannels::byteSize() const {
+    return checked_cast<size_t>(_resDesc.totalDimSize()) *
+           checked_cast<size_t>(_resDesc.elemSize());
+}
+
+void MergeFullyConnectedContentsByChannels::fillTempBuf(void* temp) const {
+    IE_ASSERT(!_contents.empty());
+    // A vpu::DataNode carries both a content object and its own vpu::DataDesc with a dimensions vector.
+    // The content has a dimensions vector as well, and the two may differ, so the total number of
+    // channels is computed from the descriptors stored alongside the contents.
+    const auto dstC = std::accumulate(_inDescs.begin(), _inDescs.end(), 0, [](int reduction, const DataDesc& desc) {
+        return reduction + desc.dims()[Dim::C];});
+
+    for (std::size_t i = 0, dstChannelsOffset = 0; i < _inDescs.size(); ++i) {
+        const auto& content = _contents[i];
+        const auto& srcDesc = _inDescs[i];
+
+        const auto& srcDims = srcDesc.dims();
+        const auto& elemSize = srcDesc.elemSize();
+
+        const auto N = srcDims.get(Dim::N, 1);
+        const auto H = srcDims.get(Dim::H, 1);
+        const auto W = srcDims.get(Dim::W, 1) * elemSize;
+
+        const auto& srcC = srcDims[Dim::C];
+
+        const auto src = content->get<uint8_t>();
+        auto dst = static_cast<uint8_t*>(temp);
+
+        InferenceEngine::parallel_for4d(N, srcC, H, W, [dstChannelsOffset, N, H, W, src, dst, srcC, dstC](int n, int c, int h, int w) {
+            const auto& srcc = c;
+            const auto& dstc = dstChannelsOffset + c;
+
+            const auto& srcOffset = n * H * W * srcC + srcc * H * W + h * W + w;
+            const auto& dstOffset = n * H * W * dstC + dstc * H * W + h * W + w;
+            dst[dstOffset] = src[srcOffset];
+        });
+
+        dstChannelsOffset += srcC;
+    }
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/mtcnn_blob_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/mtcnn_blob_content.cpp
new file mode 100644 (file)
index 0000000..66558b3
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/mtcnn_blob_content.hpp>
+
+namespace vpu {
+
+MTCNNBlobContent::MTCNNBlobContent(std::vector<char> blob) : _blob(std::move(blob)) {
+    IE_ASSERT(!_blob.empty());
+}
+
+size_t MTCNNBlobContent::byteSize() const {
+    return _blob.size();
+}
+
+const void* MTCNNBlobContent::getRaw() const {
+    return _blob.data();
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/prelu_blob_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/prelu_blob_content.cpp
new file mode 100644 (file)
index 0000000..fda51c7
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/prelu_blob_content.hpp>
+
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+PReLUBlobContent::PReLUBlobContent(const ie::Blob::CPtr& blob, const DataDesc& desc, int repeat) :
+        _blob(blob), _desc(desc), _repeat(repeat) {
+    VPU_INTERNAL_CHECK(repeat >= 1,
+        "PReLUBlobContent only supports repeat value more than 1, actual is {}", repeat);
+}
+
+size_t PReLUBlobContent::byteSize() const {
+    return checked_cast<size_t>(_desc.totalDimSize()) *
+           checked_cast<size_t>(_desc.elemSize());
+}
+
+const void* PReLUBlobContent::getRaw() const {
+    if (_blobFp16 == nullptr) {
+        _blobFp16 = _blob->getTensorDesc().getPrecision() == ie::Precision::FP16 ?
+                    _blob : convertBlobFP32toFP16(_blob);
+    }
+
+    if (_repeat == 1) {
+        return _blobFp16->cbuffer();
+    }
+
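+    // With _repeat > 1 the single set of PReLU weights is tiled _repeat times into a cached
+    // temporary FP16 buffer so that the content size matches the descriptor's total size.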
+    if (_tempFp16.empty()) {
+        VPU_PROFILE(PReLUBlobContent);
+
+        IE_ASSERT(_desc.totalDimSize() % _repeat == 0);
+
+        auto origNumElems = _desc.totalDimSize() / _repeat;
+        IE_ASSERT(checked_cast<size_t>(origNumElems) <= _blobFp16->size());
+
+        auto origPtr = _blobFp16->cbuffer().as<const fp16_t*>();
+        IE_ASSERT(origPtr != nullptr);
+
+        _tempFp16.resize(checked_cast<size_t>(_desc.totalDimSize()));
+
+        ie::parallel_for(_repeat, [this, origPtr, origNumElems](int i) {
+            std::copy_n(origPtr, origNumElems, _tempFp16.data() + i * origNumElems);
+        });
+    }
+
+    return _tempFp16.data();
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/priorbox_contents.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/priorbox_contents.cpp
new file mode 100644 (file)
index 0000000..94f8162
--- /dev/null
@@ -0,0 +1,381 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/priorbox_contents.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <precision_utils.h>
+#include <ie_layers.h>
+#include <ie_parallel.hpp>
+
+namespace vpu {
+
+//
+// PriorBoxContent
+//
+
+PriorBoxContent::PriorBoxContent(
+        const DataDesc& inDesc0,
+        const DataDesc& inDesc1,
+        const DataDesc& outDesc,
+        const ie::CNNLayerPtr &layer) :
+        _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+        _layer(layer) {
+    IE_ASSERT(layer != nullptr);
+}
+
+size_t PriorBoxContent::byteSize() const {
+    return checked_cast<size_t>(_outDesc.totalDimSize()) *
+           checked_cast<size_t>(_outDesc.elemSize());
+}
+
+void PriorBoxContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(PriorBoxContent);
+
+    auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
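+    // Prior boxes are computed on the host at compile time and stored as FP16 constant data:
+    // the first half of the buffer holds box coordinates, the second half holds the variances.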
+    auto _min_sizes = _layer->GetParamAsFloats("min_size", {});
+    auto _max_sizes = _layer->GetParamAsFloats("max_size", {});
+    auto aspect_ratios = _layer->GetParamAsFloats("aspect_ratio");
+    auto _flip = static_cast<bool>(_layer->GetParamAsInt("flip"));
+    auto _clip = static_cast<bool>(_layer->GetParamAsInt("clip"));
+    auto _variance = _layer->GetParamAsFloats("variance");
+    auto _img_h = _layer->GetParamAsInt("img_h", 0);
+    auto _img_w = _layer->GetParamAsInt("img_w", 0);
+    auto _step = _layer->GetParamAsFloat("step", 0);
+    auto _offset = _layer->GetParamAsFloat("offset", 0);
+    auto _scale_all_sizes = static_cast<bool>(_layer->GetParamAsInt("scale_all_sizes", 1));
+
+    auto _fixed_sizes = _layer->GetParamAsFloats("fixed_size", {});
+    auto _fixed_ratios = _layer->GetParamAsFloats("fixed_ratio", {});
+    auto _densitys = _layer->GetParamAsFloats("density", {});
+
+    SmallVector<float> _aspect_ratios;
+    _aspect_ratios.reserve(aspect_ratios.size() + 1);
+
+    _aspect_ratios.push_back(1.0f);
+    for (const auto& aspect_ratio : aspect_ratios) {
+        bool exist = false;
+
+        for (const auto& _aspect_ratio : _aspect_ratios) {
+            if (fabsf(aspect_ratio - _aspect_ratio) < 1e-6) {
+                exist = true;
+                break;
+            }
+        }
+        if (!exist) {
+            _aspect_ratios.push_back(aspect_ratio);
+            if (_flip) {
+                if (isFloatEqual(aspect_ratio, 0.f)) {
+                    THROW_IE_EXCEPTION << "[VPU] PriorBox has 0.0 aspect ratio param in flip mode, "
+                                       << " possible division by zero";
+                }
+                _aspect_ratios.push_back(1.0f / aspect_ratio);
+            }
+        }
+    }
+
+    int _num_priors;
+    if (_scale_all_sizes) {
+        _num_priors = static_cast<int>(_aspect_ratios.size() * _min_sizes.size());
+    } else {
+        _num_priors = static_cast<int>(_aspect_ratios.size() + _min_sizes.size() - 1);
+    }
+
+    if (!_fixed_sizes.empty()) {
+        _num_priors = static_cast<int>(_aspect_ratios.size() * _fixed_sizes.size());
+    }
+
+    if (!_densitys.empty()) {
+        for (const auto& _density : _densitys) {
+            if (!_fixed_ratios.empty()) {
+                _num_priors += _fixed_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
+            } else {
+                _num_priors += _aspect_ratios.size() * (static_cast<int>(pow(_density, 2)) - 1);
+            }
+        }
+    }
+
+    _num_priors += _max_sizes.size();
+
+    auto W  = _inDesc0.dim(Dim::W);
+    auto H  = _inDesc0.dim(Dim::H);
+    auto IW = _img_w == 0 ? _inDesc1.dim(Dim::W) : _img_w;
+    auto IH = _img_h == 0 ? _inDesc1.dim(Dim::H) : _img_h;
+    auto IWI = 1.0f / static_cast<float>(IW);
+    auto IHI = 1.0f / static_cast<float>(IH);
+
+    auto OW = (_outDesc.numDims() >= 4) ? _outDesc.dim(Dim::N) : 1;
+    auto OH = _outDesc.dim(Dim::W);
+
+    float step_x = 0.0f;
+    float step_y = 0.0f;
+
+    if (_step == 0) {
+        step_x = static_cast<float>(IW) / W;
+        step_y = static_cast<float>(IH) / H;
+    } else {
+        step_x = _step;
+        step_y = _step;
+    }
+
+    auto dst_data = tempPtr;
+
+    int dim = H * W * _num_priors * 4;
+    float center_x = 0.0f;
+    float center_y = 0.0f;
+
+    float box_width = 0.0f;
+    float box_height = 0.0f;
+
+    if (_outDesc.dim(Dim::W) != dim || _outDesc.dim(Dim::H) != 2) {
+        THROW_IE_EXCEPTION << "[VPU] PriorBox output have invalid dimension, exptected " << dim << "x2"
+                           << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H)
+                           << ", layer name is: " << _layer->name;
+    }
+
+    auto max_fp16 = [](const float value, const float min) {
+        return ie::PrecisionUtils::f32tof16(value > min ? value : min);
+    };
+
+    auto min_fp16 = [](const float value, const float max) {
+        return ie::PrecisionUtils::f32tof16(value < max ? value : max);
+    };
+
+    size_t idx = 0;
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W;  ++w) {
+            if (_step == 0) {
+                center_x = (static_cast<float>(w) + 0.5f) * step_x;
+                center_y = (static_cast<float>(h) + 0.5f) * step_y;
+            } else {
+                center_x = (_offset + static_cast<float>(w)) * _step;
+                center_y = (_offset + static_cast<float>(h)) * _step;
+            }
+
+            for (size_t s = 0; s < _fixed_sizes.size(); ++s) {
+                auto fixed_size_ = static_cast<size_t>(_fixed_sizes[s]);
+                box_width = box_height = fixed_size_ * 0.5f;
+
+                int density_ = 0;
+                int shift = 0;
+                if (s < _densitys.size()) {
+                    density_ = static_cast<size_t>(_densitys[s]);
+                    shift = static_cast<int>(_fixed_sizes[s] / density_);
+                }
+
+                if (!_fixed_ratios.empty()) {
+                    for (const auto& fr : _fixed_ratios) {
+                        const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(fr);
+                        const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(fr);
+
+                        for (size_t r = 0; r < density_; ++r) {
+                            for (size_t c = 0; c < density_; ++c) {
+                                const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+                                const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+                                dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
+                                dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
+                                dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
+                                dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
+                            }
+                        }
+                    }
+                } else {
+                    if (!_densitys.empty()) {
+                        for (int r = 0; r < density_; ++r) {
+                            for (int c = 0; c < density_; ++c) {
+                                const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+                                const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+                                dst_data[idx++] = max_fp16((center_x_temp - box_width) * IWI, 0.f);
+                                dst_data[idx++] = max_fp16((center_y_temp - box_height) * IHI, 0.f);
+                                dst_data[idx++] = min_fp16((center_x_temp + box_width) * IWI, 1.f);
+                                dst_data[idx++] = min_fp16((center_y_temp + box_height) * IHI, 1.f);
+                            }
+                        }
+                    }
+                    //  Rest of priors
+                    for (const auto& ar : _aspect_ratios) {
+                        if (fabs(ar - 1.) < 1e-6) {
+                            continue;
+                        }
+
+                        const auto box_width_ratio = _fixed_sizes[s] * 0.5f * std::sqrt(ar);
+                        const auto box_height_ratio = _fixed_sizes[s] * 0.5f / std::sqrt(ar);
+                        for (int r = 0; r < density_; ++r) {
+                            for (int c = 0; c < density_; ++c) {
+                                const auto center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift;
+                                const auto center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift;
+
+                                dst_data[idx++] = max_fp16((center_x_temp - box_width_ratio) * IWI, 0.f);
+                                dst_data[idx++] = max_fp16((center_y_temp - box_height_ratio) * IHI, 0.f);
+                                dst_data[idx++] = min_fp16((center_x_temp + box_width_ratio) * IWI, 1.f);
+                                dst_data[idx++] = min_fp16((center_y_temp + box_height_ratio) * IHI, 1.f);
+                            }
+                        }
+                    }
+                }
+            }
+
+            for (size_t msIdx = 0; msIdx < _min_sizes.size(); msIdx++) {
+                box_width = _min_sizes[msIdx];
+                box_height = _min_sizes[msIdx];
+
+                dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+
+                if (_max_sizes.size() > msIdx) {
+                    box_width = box_height = std::sqrt(_min_sizes[msIdx] * _max_sizes[msIdx]);
+
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                    dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+                }
+
+                if (_scale_all_sizes || (!_scale_all_sizes && (msIdx == _min_sizes.size() - 1))) {
+                    size_t sIdx = _scale_all_sizes ? msIdx : 0;
+                    for (const auto& ar : _aspect_ratios) {
+                        if (std::fabs(ar - 1.0f) < 1e-6) {
+                            continue;
+                        }
+
+                        box_width = _min_sizes[sIdx] * std::sqrt(ar);
+                        box_height = _min_sizes[sIdx] / std::sqrt(ar);
+
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x - box_width / 2.0f) / IW);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y - box_height / 2.0f) / IH);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_x + box_width / 2.0f) / IW);
+                        dst_data[idx++] = ie::PrecisionUtils::f32tof16((center_y + box_height / 2.0f) / IH);
+                    }
+                }
+            }
+        }
+    }
+
+    if (_clip) {
+        for (int d = 0; d < dim; ++d) {
+            dst_data[d] = (std::min)((std::max)(dst_data[d], ie::PrecisionUtils::f32tof16(0.0f)), ie::PrecisionUtils::f32tof16(1.0f));
+        }
+    }
+
+    int channel_size = OH * OW;
+
+    dst_data += channel_size;
+
+    if (_variance.size() == 1) {
+        ie::parallel_for(channel_size, [&](int i) {
+            dst_data[i] = ie::PrecisionUtils::f32tof16(_variance[0]);
+        });
+    } else {
+        ie::parallel_for4d(H, W, _num_priors, 4, [&](int h, int w, int i, int j) {
+            dst_data[j + 4 * (i + _num_priors * (w + W * h))] = ie::PrecisionUtils::f32tof16(_variance[j]);
+        });
+    }
+}
+
+//
+// PriorBoxClusteredContent
+//
+
+PriorBoxClusteredContent::PriorBoxClusteredContent(
+        const DataDesc& inDesc0,
+        const DataDesc& inDesc1,
+        const DataDesc& outDesc,
+        const ie::CNNLayerPtr& layer) :
+        _inDesc0(inDesc0), _inDesc1(inDesc1), _outDesc(outDesc),
+        _layer(layer) {
+    IE_ASSERT(layer != nullptr);
+}
+
+size_t PriorBoxClusteredContent::byteSize() const {
+    return checked_cast<size_t>(_outDesc.totalDimSize()) *
+           checked_cast<size_t>(_outDesc.elemSize());
+}
+
+void PriorBoxClusteredContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(PriorBoxClusteredContent);
+
+    auto tempPtr = static_cast<fp16_t*>(tempBuf);
+
+    auto widths_ = _layer->GetParamAsFloats("width");
+    auto heights_ = _layer->GetParamAsFloats("height");
+    auto clip_ = _layer->GetParamAsInt("clip");
+    auto variance_ = _layer->GetParamAsFloats("variance");
+    auto img_h_ = _layer->GetParamAsInt("img_h", 0);
+    auto img_w_ = _layer->GetParamAsInt("img_w", 0);
+    auto step_ = _layer->GetParamAsFloat("step", 0);
+    auto step_h_ = _layer->GetParamAsFloat("step_h", 0);
+    auto step_w_ = _layer->GetParamAsFloat("step_w", 0);
+    auto offset_ = _layer->GetParamAsFloat("offset", 0);
+
+    auto num_priors_ = widths_.size();
+
+    if (variance_.empty()) {
+        variance_.push_back(0.1);
+    }
+
+    auto layer_width  = _inDesc0.dim(Dim::W);
+    auto layer_height = _inDesc0.dim(Dim::H);
+
+    auto img_width  = img_w_ == 0 ? _inDesc1.dim(Dim::W) : img_w_;
+    auto img_height = img_h_ == 0 ? _inDesc1.dim(Dim::H) : img_h_;
+
+    auto step_w = step_w_ == 0 ? step_ : step_w_;
+    auto step_h = step_h_ == 0 ? step_ : step_h_;
+    if (step_w == 0 || step_h == 0) {
+        step_w = static_cast<float>(img_width) / layer_width;
+        step_h = static_cast<float>(img_height) / layer_height;
+    }
+
+    auto expected_output_dimx = layer_height * layer_width * num_priors_ * 4;
+    if (_outDesc.dim(Dim::W) != expected_output_dimx || _outDesc.dim(Dim::H) != 2) {
+        THROW_IE_EXCEPTION << "PriorBoxClustered output has invalid dimensions, expected " << expected_output_dimx << "x2"
+                           << ", got " << _outDesc.dim(Dim::W) << "x" << _outDesc.dim(Dim::H) << ", layer name is: " << _layer->name;
+    }
+
+    auto offset = _outDesc.dim(Dim::W);
+    auto var_size = variance_.size();
+
+    auto top_data_0 = tempPtr;
+    auto top_data_1 = top_data_0 + offset;
+
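+    // top_data_0 receives the clustered box coordinates, top_data_1 the per-box variances;
+    // both halves are filled in a single pass over the layer grid below.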
+    ie::parallel_for2d(layer_height, layer_width, [=](int h, int w) {
+        auto center_x = (w + offset_) * step_w;
+        auto center_y = (h + offset_) * step_h;
+
+        for (int s = 0; s < num_priors_; ++s) {
+            auto box_width  = widths_[s];
+            auto box_height = heights_[s];
+
+            auto xmin = (center_x - box_width  / 2.0f) / img_width;
+            auto ymin = (center_y - box_height / 2.0f) / img_height;
+            auto xmax = (center_x + box_width  / 2.0f) / img_width;
+            auto ymax = (center_y + box_height / 2.0f) / img_height;
+
+            if (clip_) {
+                xmin = std::min(std::max(xmin, 0.0f), 1.0f);
+                ymin = std::min(std::max(ymin, 0.0f), 1.0f);
+                xmax = std::min(std::max(xmax, 0.0f), 1.0f);
+                ymax = std::min(std::max(ymax, 0.0f), 1.0f);
+            }
+
+            top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 0] = ie::PrecisionUtils::f32tof16(xmin);
+            top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 1] = ie::PrecisionUtils::f32tof16(ymin);
+            top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 2] = ie::PrecisionUtils::f32tof16(xmax);
+            top_data_0[h * layer_width * num_priors_ * 4 + w * num_priors_ * 4 + s * 4 + 3] = ie::PrecisionUtils::f32tof16(ymax);
+
+            for (int j = 0; j < var_size; j++) {
+                auto index = h * layer_width * num_priors_ * var_size + w * num_priors_ * var_size + s * var_size + j;
+                top_data_1[index] = ie::PrecisionUtils::f32tof16(variance_[j]);
+            }
+        }
+    });
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/replicated_data_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/replicated_data_content.cpp
new file mode 100644 (file)
index 0000000..da01e72
--- /dev/null
@@ -0,0 +1,58 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+ReplicatedContent::ReplicatedContent(float val, int count, const DataDesc& desc) :
+        _factor{val}, _count(count), _desc(desc) {}
+
+ReplicatedContent::ReplicatedContent(DataContent::Ptr origContent, int count, const DataDesc& desc) :
+        _origContent(origContent), _count(count), _desc(desc) {}
+
+size_t ReplicatedContent::byteSize() const {
+    if (!_origContent) {
+        return checked_cast<size_t>(_count) * sizeof(fp16_t);
+    } else {
+        IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+        return checked_cast<size_t>(_desc.totalDimSize()) * sizeof(fp16_t);
+    }
+}
+
+void ReplicatedContent::fillTempBuf(void* tempBuf) const {
+    VPU_PROFILE(ReplicatedContent);
+
+    auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
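+    // Either broadcast the scalar _factor _count times, or tile the original content
+    // _count times to fill the destination buffer.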
+    if (!_origContent) {
+        std::fill_n(dstPtr, _count, ie::PrecisionUtils::f32tof16(_factor));
+    } else {
+        IE_ASSERT(_desc.totalDimSize() % _count == 0);
+
+        auto origCount = _desc.totalDimSize() / _count;
+        auto origPtr = _origContent->get<fp16_t>();
+        IE_ASSERT(origPtr != nullptr);
+
+        ie::parallel_for(_count, [origPtr, origCount, dstPtr](int i) {
+            std::copy_n(origPtr, origCount, dstPtr + i * origCount);
+        });
+    }
+}
+
+DataContent::Ptr replicateContent(float val, int count, const DataDesc& desc) {
+    return std::make_shared<ReplicatedContent>(val, count, desc);
+}
+
+DataContent::Ptr replicateContent(const DataContent::Ptr& origContent, int count, const DataDesc& desc) {
+    return std::make_shared<ReplicatedContent>(origContent, count, desc);
+}
+
+} // namespace vpu
diff --git a/inference-engine/src/vpu/graph_transformer/src/model/data_contents/scaled_content.cpp b/inference-engine/src/vpu/graph_transformer/src/model/data_contents/scaled_content.cpp
new file mode 100644 (file)
index 0000000..c96635a
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/model/data_contents/scaled_content.hpp>
+
+#include <vpu/utils/profiling.hpp>
+
+#include <ie_parallel.hpp>
+#include <precision_utils.h>
+
+namespace vpu {
+
+ScaledContent::ScaledContent(const DataContent::Ptr& origContent, float scale) :
+        _origContent(origContent), _factor(scale) {
+}
+
+size_t ScaledContent::byteSize() const {
+    return _origContent->byteSize();
+}
+
+void ScaledContent::fillTempBuf(void *tempBuf) const {
+    VPU_PROFILE(ScaledContent);
+
+    const auto totalSize = _origContent->byteSize() / sizeof(fp16_t);
+
+    auto srcPtr = _origContent->get<fp16_t>();
+    IE_ASSERT(srcPtr != nullptr);
+
+    auto dstPtr = static_cast<fp16_t*>(tempBuf);
+
+    ie::parallel_for(totalSize, [this, srcPtr, dstPtr](int i) {
+        dstPtr[i] = ie::PrecisionUtils::f32tof16(ie::PrecisionUtils::f16tof32(srcPtr[i]) * _factor);
+    });
+}
+
+DataContent::Ptr scaleContent(const DataContent::Ptr& origContent, float scale) {
+    return std::make_shared<ScaledContent>(origContent, scale);
+}
+
+} // namespace vpu
index 0371064..3645c59 100644 (file)
@@ -4,6 +4,14 @@
 
 #include <vpu/model/model.hpp>
 
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/auto_scope.hpp>
+#include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
+#include <details/caseless.hpp>
+#include "blob_factory.hpp"
+
 #include <cctype>
 #include <memory>
 #include <string>
 #include <exception>
 #include <algorithm>
 
-#include <details/caseless.hpp>
-
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/auto_scope.hpp>
-#include <vpu/utils/profiling.hpp>
-
-#include "blob_factory.hpp"
-
 namespace vpu {
 
 //
@@ -96,6 +96,11 @@ Data ModelObj::addConstData(
         const DataContent::Ptr& content) {
     IE_ASSERT(content != nullptr);
 
+    VPU_THROW_UNLESS(desc.totalDimSize() * desc.elemSize() == content->byteSize(),
+        "duplicateData error: while duplicating {} Const data got different "
+        "newDesc and content byte sizes ({} and {} respectively)",
+        name, desc.totalDimSize() * desc.elemSize(), content->byteSize());
+
     std::shared_ptr<DataNode> data(new DataNode);
 
     data->_name = name;
@@ -104,7 +109,6 @@ Data ModelObj::addConstData(
     data->_model = this;
 
     data->_content = content;
-    content->_desc = desc;
 
     data->_ptrPosInModel = _dataPtrList.emplace(_dataPtrList.end(), data);
     _dataList.push_back(data);
@@ -120,7 +124,7 @@ Data ModelObj::addConstData(const std::string& name, const DataDesc& descriptor,
     if (generator) {
         generator(ieBlob);
     }
-    return addConstData(name, descriptor, ieBlobContent(ieBlob));
+    return addConstData(name, descriptor, ieBlobContent(ieBlob, descriptor.type()));
 }
 
 Data ModelObj::addNewData(
@@ -183,10 +187,15 @@ Data ModelObj::duplicateData(
     newData->_model = this;
 
     if (newDataUsage == DataUsage::Const) {
-        newData->_content = newContent != nullptr ? newContent : origData->content();
-        if (newContent != nullptr) {
-            newContent->_desc = newData->_desc;
-        }
+        const auto& content = newContent != nullptr ? newContent : origData->content();
+        const auto& desc = newDesc != DataDesc() ? newDesc : origData->desc();
+
+        VPU_THROW_UNLESS(desc.totalDimSize() * desc.elemSize() == content->byteSize(),
+            "duplicateData error: while duplicating {} Const data got different "
+            "desc and content byte sizes ({} and {} respectively)",
+            origData->name(), desc.totalDimSize() * desc.elemSize(), content->byteSize());
+
+        newData->_content = content;
     }
 
     newData->attrs().copyFrom(origData->attrs());
index 4ed95a7..1efb458 100644 (file)
@@ -4,76 +4,19 @@
 
 #include <vpu/frontend/frontend.hpp>
 
-#include <cmath>
-
-#include <vector>
-#include <memory>
-
-#include <precision_utils.h>
-#include <ie_parallel.hpp>
-
 #include <vpu/utils/ie_helpers.hpp>
 #include <vpu/utils/numeric.hpp>
 #include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/batch_norm_contents.hpp>
 
-namespace vpu {
-
-namespace {
-
-class BatchNormalizationWeightsContent final : public CalculatedDataContent {
-public:
-    BatchNormalizationWeightsContent(
-            const DataContent::Ptr& origContent,
-            float epsilon) :
-            CalculatedDataContent({origContent}), _epsilon(epsilon) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(BatchNormalizationWeightsContent);
-
-        auto srcPtr = baseContents[0]->get<fp16_t>();
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        ie::parallel_for(desc().totalDimSize(), [this, srcPtr, dstPtr](int i) {
-            float val = ie::PrecisionUtils::f16tof32(srcPtr[i]) + _epsilon;
-            val = 1.0f / std::sqrt(val);
-            dstPtr[i] = ie::PrecisionUtils::f32tof16(val);
-        });
-    }
-
-private:
-    float _epsilon;
-};
-
-class BatchNormalizationBiasesContent final : public CalculatedDataContent {
-public:
-    BatchNormalizationBiasesContent(
-            const DataContent::Ptr& origContent,
-            const DataContent::Ptr& weightsContent) :
-            CalculatedDataContent({origContent, weightsContent}) {
-    }
-
-protected:
-    void fillTempBuf(const SmallVector<DataContent::Ptr, 2>& baseContents, void* tempBuf) const override {
-        VPU_PROFILE(BatchNormalizationBiasesContent);
-
-        auto origPtr = baseContents[0]->get<fp16_t>();
-        auto weightsPtr = baseContents[1]->get<fp16_t>();
-
-        auto dstPtr = static_cast<fp16_t*>(tempBuf);
-
-        ie::parallel_for(desc().totalDimSize(), [origPtr, weightsPtr, dstPtr](int i) {
-            // TODO : need to be extracted from IE layer.
-            float beta = 0.0f;
+#include <precision_utils.h>
+#include <ie_parallel.hpp>
 
-            auto wVal = ie::PrecisionUtils::f16tof32(weightsPtr[i]);
-            dstPtr[i] = ie::PrecisionUtils::f32tof16(beta - wVal * ie::PrecisionUtils::f16tof32(origPtr[i]));
-        });
-    }
-};
+#include <cmath>
+#include <vector>
+#include <memory>
 
-}  // namespace
+namespace vpu {
 
 void FrontEnd::parseBatchNorm(const Model& model, const ie::CNNLayerPtr& _layer, const DataVector& inputs, const DataVector& outputs) const {
     IE_ASSERT(inputs.size() == 1);
index 3b5db2c..55844e0 100644 (file)
@@ -4,6 +4,11 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/frontend/custom_layer.hpp>
+#include <vpu/utils/simple_math.hpp>
+#include <vpu/model/data_contents/kernel_binary_content.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <vector>
 #include <memory>
 #include <string>
 #include <algorithm>
 #include <tuple>
 
-#include <vpu/frontend/custom_layer.hpp>
-#include <vpu/utils/simple_math.hpp>
-
-
 namespace vpu {
 
 static void calcSizesFromParams(const DataDesc &desc, const SmallVector<std::string> &bufferSizeRules, SmallVector<int, 3> &sizes);
 
 namespace {
 
-class KernelBinaryContent final : public DataContent {
-public:
-    explicit KernelBinaryContent(const std::string& blob) : _blob(blob) {
-        IE_ASSERT(!_blob.empty());
-    }
-
-    const void* getRaw() const override {
-        IE_ASSERT(desc().totalDimSize() * desc().elemSize() == _blob.length());
-        return _blob.data();
-    }
-
-private:
-    std::string _blob;
-};
-
 class CustomStage final : public StageNode {
 public:
     using StageNode::StageNode;
index 80d35c0..8d8f6d3 100644 (file)
@@ -195,10 +195,10 @@ private:
         auto input2 = inputEdge(2)->input();
         auto output = outputEdge(0)->output();
 
-        input0->serializeBuffer(serializer, output->desc().dimsOrder());
+        input0->serializeBuffer(serializer);
         output->serializeBuffer(serializer);
-        input1->serializeBuffer(serializer, output->desc().dimsOrder());
-        input2->serializeBuffer(serializer, output->desc().dimsOrder());
+        input1->serializeBuffer(serializer);
+        input2->serializeBuffer(serializer);
     }
 };
 
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/exp_topkrois.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/exp_topkrois.cpp
new file mode 100644 (file)
index 0000000..e1ff644
--- /dev/null
@@ -0,0 +1,103 @@
+// Copyright (C) 2019-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+
+namespace vpu {
+
+namespace {
+
+class ExpTopKROIsStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ExpTopKROIsStage>(*this);
+    }
+
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+    }
+
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        for (const auto& inEdge : inputEdges()) {
+            stridesInfo.setInput(inEdge, StridesRequirement::compact());
+        }
+        for (const auto& outEdge : outputEdges()) {
+            stridesInfo.setOutput(outEdge, StridesRequirement::compact());
+        }
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this,
+             {{DataType::FP16}, {DataType::FP16}},
+             {{DataType::FP16}});
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto& params = attrs().get<int32_t>("max_rois");
+
+        serializer.append(params);
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        input(0)->serializeBuffer(serializer);
+        input(1)->serializeBuffer(serializer);
+        output(0)->serializeBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseExpTopKROIs(
+        const Model& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) const {
+    VPU_THROW_UNLESS(inputs.size() == 2, "Layer %s must have 2 input tensors.", layer->name);
+    VPU_THROW_UNLESS(outputs.size() == 1, "Layer %s must have 1 output tensor.", layer->name);
+
+    int32_t max_rois = layer->GetParamAsInt("max_rois", 0);
+
+    auto inputRois  = inputs[0];
+    auto inputProbs = inputs[1];
+    auto outputRois = outputs[0];
+
+    VPU_THROW_UNLESS((inputRois->desc().dims().size() == 2) &&
+                     (inputRois->desc().dim(Dim::C) == 4),
+                     "Wrong shape for input 0 of layer %s, expected (N, 4), got: dims size = %lu, dim C = %d",
+                     layer->name, inputRois->desc().dims().size(), inputRois->desc().dim(Dim::C));
+
+    VPU_THROW_UNLESS(inputProbs->desc().dims().size() == 1,
+                     "Wrong shape for input 1 of layer %s, expected dim size = 1, got: %lu",
+                     layer->name, inputProbs->desc().dims().size());
+
+    VPU_THROW_UNLESS(inputProbs->desc().dim(Dim::C) == inputRois->desc().dim(Dim::N),
+                     "Layer %s: input0 dim N and input1 dim C must be equal, got: input0 (N = %d), input1 (C = %d)",
+                     layer->name, inputProbs->desc().dim(Dim::N), inputProbs->desc().dim(Dim::C));
+
+    VPU_THROW_UNLESS((outputRois->desc().dims().size() == 2) &&
+                     (outputRois->desc().dim(Dim::C) == 4),
+                     "Wrong shape for output 0 of layer %s, expected (N, 4), got: dims size = %lu, dim C = %d",
+                     layer->name, outputRois->desc().dims().size(), outputRois->desc().dim(Dim::C));
+
+    VPU_THROW_UNLESS(outputRois->desc().dim(Dim::N) == max_rois,
+                     "Wrong shape for output 0 of layer %s, expected dim N = %d, got: dim N = %d",
+                     layer->name, static_cast<int>(max_rois), outputRois->desc().dim(Dim::N));
+
+    auto stage = model->addNewStage<ExpTopKROIsStage>(
+        layer->name,
+        StageType::ExpTopKROIs,
+        layer,
+        inputs,
+        outputs);
+
+    stage->attrs().set("max_rois", max_rois);
+}
+
+}  // namespace vpu
index ff89891..901f9f8 100644 (file)
@@ -63,7 +63,7 @@ protected:
     }
 
     StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
-        return StageSHAVEsRequirements::OnlyOne;
+        return StageSHAVEsRequirements::NotNeeded;
     }
 
     void initialCheckImpl() const override {
index 1e18a98..8562df3 100644 (file)
@@ -4,6 +4,13 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/graph_transformer.hpp>
+#include <vpu/compile_env.hpp>
+#include <vpu/utils/file_system.hpp>
+#include <vpu/model/data_contents/mtcnn_blob_content.hpp>
+
+#include <cpp/ie_cnn_net_reader.h>
+
 #include <vector>
 #include <fstream>
 #include <string>
 #include <memory>
 #include <set>
 
-#include <cpp/ie_cnn_net_reader.h>
-
-#include <vpu/graph_transformer.hpp>
-#include <vpu/compile_env.hpp>
-#include <vpu/utils/file_system.hpp>
-
 namespace vpu {
 
 // Must be synchronized with MvTensor
@@ -94,21 +95,6 @@ private:
     }
 };
 
-class MTCNNBlobContent final : public DataContent {
-public:
-    explicit MTCNNBlobContent(std::vector<char>&& blob) : _blob(std::forward<std::vector<char>>(blob)) {
-        IE_ASSERT(!_blob.empty());
-    }
-
-    const void* getRaw() const override {
-        IE_ASSERT(desc().totalDimSize() * desc().elemSize() == _blob.size());
-        return _blob.data();
-    }
-
-private:
-    std::vector<char> _blob;
-};
-
 std::pair<int, int> getResolution(const std::string& str) {
     std::istringstream stream(str);
     std::string output;
@@ -264,7 +250,7 @@ void FrontEnd::parseMTCNN(const Model& model, const ie::CNNLayerPtr& layer, cons
     auto innerGraphsDesc = DataDesc({mergedBlob.size()});
     innerGraphsDesc.setType(DataType::U8);
 
-    auto innerGraphs = model->addConstData(layer->name + "@innerGraphs", innerGraphsDesc, std::make_shared<MTCNNBlobContent>(std::move(mergedBlob)));
+    auto innerGraphs = model->addConstData(layer->name + "@innerGraphs", innerGraphsDesc, std::make_shared<MTCNNBlobContent>(mergedBlob));
 
     auto stage = model->addNewStage<MTCNNStage>(layer->name, StageType::MTCNN, layer, {input, innerGraphs}, {output});
     stage->attrs().set("pyramid", pyramid);
index bd83d35..536093f 100644 (file)
@@ -101,17 +101,17 @@ void MyriadXHwStage::finalCheckImpl() const {
     const auto input = inputEdge(0)->input();
     const auto output = outputEdge(0)->output();
 
-    IE_ASSERT(input->memoryOffset() % 16 == 0);
-    IE_ASSERT(output->memoryOffset() % 16 == 0);
+    IE_ASSERT(input->dataLocation().offset % 16 == 0);
+    IE_ASSERT(output->dataLocation().offset % 16 == 0);
 
     if (attrs().get<HwOpType>("hwOpType") != HwOpType::POOL) {
         const auto weights = inputEdge(1)->input();
         const auto biases = inputEdge(2)->input();
         const auto scales = inputEdge(3)->input();
 
-        IE_ASSERT(weights->memoryOffset() % 16 == 0);
-        IE_ASSERT(biases->memoryOffset() % 16 == 0);
-        IE_ASSERT(scales->memoryOffset() % 16 == 0);
+        IE_ASSERT(weights->dataLocation().offset % 16 == 0);
+        IE_ASSERT(biases->dataLocation().offset % 16 == 0);
+        IE_ASSERT(scales->dataLocation().offset % 16 == 0);
     }
 }
 
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/nonzero.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/nonzero.cpp
new file mode 100644 (file)
index 0000000..010337b
--- /dev/null
@@ -0,0 +1,120 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+#include <precision_utils.h>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+namespace {
+
+class NonZero : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<NonZero>(*this);
+    }
+
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+    }
+
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        auto inputStrides = input(0)->requiredStrides();
+        auto outIndicesStrides = output(0)->requiredStrides();
+        auto outDimsStrides = output(1)->requiredStrides();
+
+        stridesInfo.setInput(inputEdge(0), inputStrides.add(0, DimStride::Compact));
+        stridesInfo.setOutput(outputEdge(0), outIndicesStrides.add(0, DimStride::Compact));
+        stridesInfo.setOutput(outputEdge(1), outDimsStrides.add(0, DimStride::Compact));
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this,
+                                 {{DataType::FP16, DataType::U8, DataType::S32}},
+                                 {{DataType::S32}, {DataType::S32}});
+    }
+
+    void finalCheckImpl() const override {
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        VPU_INTERNAL_CHECK(numInputs() == 1,
+                           "Nonzero stage with name %s must have only 1 input, "
+                           "actually provided %d", name(), numInputs());
+        VPU_INTERNAL_CHECK(numOutputs() == 2,
+                           "Nonzero stage with name %s must have only 2 outputs, "
+                           "actually provided %d", name(), numOutputs());
+
+        input(0)->serializeBuffer(serializer);
+        output(0)->serializeBuffer(serializer);
+        output(1)->serializeBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseNonZero(
+        const Model& model,
+        const ie::CNNLayerPtr& layer,
+        const DataVector& inputs,
+        const DataVector& outputs) const {
+    VPU_THROW_UNLESS(inputs.size() == 1,
+                     "Nonzero layer with name %s must have only 1 input, actually provided %d",
+                     layer->name, inputs.size());
+    VPU_THROW_UNLESS(outputs.size() == 2,
+                     "Nonzero layer with name %s must have only 2 outputs, actually provided %d",
+                     layer->name, outputs.size());
+
+    const auto input = inputs[0];
+    const auto inputNumDims = input->desc().numDims();
+    const auto totalIndicesDimSize = input->desc().totalDimSize();
+
+    const auto outIndicesDesc = outputs[0]->desc();
+    const auto outIndicesPerm = outIndicesDesc.dimsOrder().toPermutation();
+    const auto minorIndicesDim = outIndicesDesc.dim(outIndicesPerm.at(0));
+    const auto majorIndicesDim = outIndicesDesc.dim(outIndicesPerm.at(1));
+    VPU_THROW_UNLESS(outIndicesDesc.numDims() == 2,
+                     "NonZero layer with name %s must have 2D output Indices tensor, "
+                     "actually provided %dD tensor",
+                     layer->name, outIndicesDesc.numDims());
+    VPU_THROW_UNLESS(minorIndicesDim >= totalIndicesDimSize,
+                     "NonZero layer with name %s must have output Indices tensor with minor dim "
+                     "size >= total amount of elements of input tensor, actually provided %d >= %d",
+                     layer->name, minorIndicesDim, totalIndicesDimSize);
+    VPU_THROW_UNLESS(majorIndicesDim == inputNumDims,
+                     "NonZero layer with name %s must have output Indices tensor with major dim "
+                     "size == number of dimensions of input tensor, actually provided %d == %d",
+                     layer->name, majorIndicesDim, inputNumDims);
+
+    const auto outDimsDesc = outputs[1]->desc();
+    const auto outDimsPerm = outDimsDesc.dimsOrder().toPermutation();
+    const auto minorDimsDim = outDimsDesc.dim(outDimsPerm.at(0));
+    VPU_THROW_UNLESS(outDimsDesc.numDims() == 1,
+                     "NonZero layer with name %s must have 1D output Dims tensor, "
+                     "actually provided %dD tensor",
+                     layer->name, outDimsDesc.numDims());
+    VPU_THROW_UNLESS(minorDimsDim >= 2,
+                     "NonZero layer with name %s must have output Dims tensor with minor dim "
+                     "size >= 2, actually provided %d",
+                     layer->name, minorDimsDim);
+
+    model->addNewStage<NonZero>(
+            layer->name,
+            StageType::NonZero,
+            layer,
+            inputs,
+            outputs);
+}
+
+}  // namespace vpu
index 2779a99..c4f1653 100644 (file)
@@ -4,6 +4,8 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <vector>
 #include <map>
 #include <unordered_set>
index c7e74e1..8a5d448 100644 (file)
@@ -4,11 +4,16 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/stages/post_op_stage.hpp>
+#include <vpu/utils/ie_helpers.hpp>
+#include <vpu/utils/profiling.hpp>
+#include <vpu/model/data_contents/prelu_blob_content.hpp>
+
+#include <ie_parallel.hpp>
+
 #include <vector>
 #include <memory>
 
-#include <vpu/stages/post_op_stage.hpp>
-
 namespace vpu {
 
 namespace {
@@ -47,7 +52,8 @@ void FrontEnd::parsePReLU(const Model& model, const ie::CNNLayerPtr& layer, cons
     auto weights = model->addConstData(
         layer->name + "@weights",
         DataDesc({output->desc().dim(Dim::C)}),
-        ieBlobContent(weightsBlob, channelShared ? output->desc().dim(Dim::C) : 1));
+        std::make_shared<PReLUBlobContent>(weightsBlob, DataDesc({output->desc().dim(Dim::C)}),
+                                           channelShared ? output->desc().dim(Dim::C) : 1));
 
     model->addNewStage<PReluStage>(layer->name, StageType::PRelu, layer, {inputs[0], weights}, outputs);
 }
index cc88e73..6d211d3 100644 (file)
@@ -4,6 +4,8 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <algorithm>
 #include <memory>
 #include <set>
@@ -91,7 +93,7 @@ private:
             input1,
             "",
             DataDesc(),
-            ieBlobContent(newIndicesBlob));
+            ieBlobContent(newIndicesBlob, DataType::S32));
 
         model()->replaceStageInput(inputEdge(1), newList);
     }
index dba670d..893a4ec 100644 (file)
@@ -4,13 +4,14 @@
 
 #include <vpu/frontend/frontend.hpp>
 
+#include <vpu/utils/numeric.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
+
 #include <vector>
 #include <string>
 #include <memory>
 #include <set>
 
-#include <vpu/utils/numeric.hpp>
-
 namespace vpu {
 
 namespace {
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/roi_align.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/roi_align.cpp
new file mode 100644 (file)
index 0000000..2241dfe
--- /dev/null
@@ -0,0 +1,109 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <vector>
+#include <string>
+#include <unordered_set>
+#include <memory>
+#include <set>
+
+namespace vpu {
+
+VPU_DECLARE_ENUM(ROIAlignMode,
+    Average = 0,
+    Max = 1
+)
+
+static const std::string s_mode = "mode";
+static const std::string s_pooled_w = "pooled_w";
+static const std::string s_pooled_h = "pooled_h";
+static const std::string s_sampling_ratio = "sampling_ratio";
+static const std::string s_spatial_scale = "spatial_scale";
+
+namespace {
+
+class ROIAlignStage final : public StageNode {
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ROIAlignStage>(*this);
+    }
+
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+        orderInfo.setInput(inputEdge(0), inputEdge(0)->input()->desc().dimsOrder().createMovedDim(Dim::C, 2));
+        orderInfo.setOutput(outputEdge(0), outputEdge(0)->output()->desc().dimsOrder().createMovedDim(Dim::C, 2));
+    }
+
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        for (const auto& inEdge : inputEdges()) {
+            stridesInfo.setInput(inEdge, StridesRequirement::compact());
+        }
+        for (const auto& outEdge : outputEdges()) {
+            stridesInfo.setOutput(outEdge, StridesRequirement::compact());
+        }
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& batchInfo) override {
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this, {{DataType::FP16}, {DataType::FP16}, {DataType::S32}}, {{DataType::FP16}});
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+        const auto pooled_w = attrs().get<int>(s_pooled_w);
+        const auto pooled_h = attrs().get<int>(s_pooled_h);
+        const auto sampling_ratio = attrs().get<int>(s_sampling_ratio);
+        const auto spatial_scale = attrs().get<float>(s_spatial_scale);
+        const auto mode = attrs().get<ROIAlignMode>(s_mode);
+
+        serializer.append(static_cast<uint32_t>(pooled_w));
+        serializer.append(static_cast<uint32_t>(pooled_h));
+        serializer.append(static_cast<uint32_t>(sampling_ratio));
+        serializer.append(static_cast<float>(spatial_scale));
+        serializer.append(static_cast<ROIAlignMode>(mode));
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        for (int i = 0; i < numInputs(); i++) {
+            inputEdge(i)->input()->serializeBuffer(serializer);
+        }
+
+        outputEdge(0)->output()->serializeBuffer(serializer);
+    }
+};
+
+}  // namespace
+
+void FrontEnd::parseROIAlign(const Model& model, const ie::CNNLayerPtr& layer, const DataVector& inputs, const DataVector& outputs) const {
+    VPU_THROW_UNLESS(inputs.size() == 3,
+                    "ROIAlign stage with name {} has invalid number of inputs: expected 3, "
+                    "actually provided {}", layer->name, inputs.size());
+
+    VPU_THROW_UNLESS(outputs.size() == 1,
+                    "ROIAlign stage with name {} has invalid number of outputs: expected 1, "
+                    "actually provided {}", layer->name, outputs.size());
+
+    const auto stage = model->addNewStage<ROIAlignStage>(layer->name, StageType::ROIAlign, layer, inputs, outputs);
+    const auto mode = layer->GetParamAsString("mode", "");
+
+    if (mode == "avg") {
+        stage->attrs().set<ROIAlignMode>(s_mode, ROIAlignMode::Average);
+    } else if (mode == "max") {
+        stage->attrs().set<ROIAlignMode>(s_mode, ROIAlignMode::Max);
+    } else {
+        VPU_THROW_FORMAT("Layer with name {} supports only (avg, max) mode", layer->name);
+    }
+
+    stage->attrs().set<int>(s_pooled_w, layer->GetParamAsInt("pooled_w"));
+    stage->attrs().set<int>(s_pooled_h, layer->GetParamAsInt("pooled_h"));
+    stage->attrs().set<int>(s_sampling_ratio, layer->GetParamAsInt("sampling_ratio"));
+    stage->attrs().set<float>(s_spatial_scale, layer->GetParamAsFloat("spatial_scale"));
+}
+
+}  // namespace vpu
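The mode handling in parseROIAlign amounts to a small string-to-enum lookup; a minimal sketch of the equivalent mapping, assuming a hypothetical free function (not part of the plugin):

    // Hypothetical mapping from the IR "mode" attribute to the stage enum used above.
    ROIAlignMode parseROIAlignMode(const std::string& mode) {
        if (mode == "avg") return ROIAlignMode::Average;
        if (mode == "max") return ROIAlignMode::Max;
        VPU_THROW_FORMAT("Unsupported ROIAlign mode: {}", mode);
    }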
diff --git a/inference-engine/src/vpu/graph_transformer/src/stages/scatter_update.cpp b/inference-engine/src/vpu/graph_transformer/src/stages/scatter_update.cpp
new file mode 100644 (file)
index 0000000..6e5ec13
--- /dev/null
@@ -0,0 +1,261 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vpu/frontend/frontend.hpp>
+
+#include <memory>
+#include <string>
+
+namespace vpu {
+
+using InferenceEngine::CNNLayerPtr;
+
+//----------------------------------------------------------------------
+
+namespace {
+
+class ScatterUpdateStage final : public StageNode {
+public:
+    using StageNode::StageNode;
+
+private:
+    StagePtr cloneImpl() const override {
+        return std::make_shared<ScatterUpdateStage>(*this);
+    }
+
+    void propagateDataOrderImpl(StageDataInfo<DimsOrder>& orderInfo) override {
+        const auto data = inputEdge(0)->input();
+        const auto indices = inputEdge(1)->input();
+        const auto updates = inputEdge(2)->input();
+        const auto axis = inputEdge(3)->input();
+        const auto output = outputEdge(0)->output();
+        orderInfo.setInput(inputEdge(0), DimsOrder::fromNumDims(data->desc().numDims()));
+        orderInfo.setInput(inputEdge(1), DimsOrder::fromNumDims(indices->desc().numDims()));
+        orderInfo.setInput(inputEdge(2), DimsOrder::fromNumDims(updates->desc().numDims()));
+        orderInfo.setInput(inputEdge(3), DimsOrder::fromNumDims(axis->desc().numDims()));
+        orderInfo.setOutput(outputEdge(0), DimsOrder::fromNumDims(output->desc().numDims()));
+    }
+
+    void getDataStridesRequirementsImpl(StageDataInfo<StridesRequirement>& stridesInfo) override {
+        stridesInfo.setInput(inputEdge(0), StridesRequirement::compact());    // `data`    tensor
+        stridesInfo.setInput(inputEdge(2), StridesRequirement::compact());    // `updates` tensor
+        stridesInfo.setOutput(outputEdge(0), StridesRequirement::compact());  // `output`  tensor
+    }
+
+    void finalizeDataLayoutImpl() override {
+    }
+
+    void getBatchSupportInfoImpl(StageDataInfo<BatchSupport>& /*batchInfo*/) override {
+    }
+
+    StageSHAVEsRequirements getSHAVEsRequirementsImpl() const override {
+        return StageSHAVEsRequirements::NotNeeded;
+    }
+
+    void initialCheckImpl() const override {
+        assertInputsOutputsTypes(this,
+            // `data`        ,  `indices`     , `updates`       , `axis` tensor
+            {{DataType::FP16}, {DataType::S32}, {DataType::FP16}, {DataType::S32}},
+            {{DataType::FP16}});
+    }
+
+    void serializeDataImpl(BlobSerializer& serializer) const override {
+        auto data    = input(0);
+        auto indices = input(1);
+        auto updates = input(2);
+        auto axis    = input(3);
+        auto out = output(0);
+
+        data->serializeBuffer(serializer);
+        out->serializeBuffer(serializer);
+        indices->serializeBuffer(serializer);
+        updates->serializeBuffer(serializer);
+        axis->serializeBuffer(serializer);
+    }
+
+    void serializeParamsImpl(BlobSerializer& serializer) const override {
+    }
+};
+
+}  // namespace
+
+//----------------------------------------------------------------------
+
+static
+void checkTensorShapes(const vpu::Data& input,
+                       const vpu::Data& output,
+                       const vpu::Data& indices,
+                       const vpu::Data& updates,
+                       const vpu::Data& axis) {
+    const DataDesc& inputDesc = input->desc();
+    const DataDesc& outputDesc = output->desc();
+    const DataDesc& indicesDesc = indices->desc();
+    const DataDesc& updatesDesc = updates->desc();
+    const DataDesc& axisDesc = axis->desc();
+
+    const auto inputType = inputDesc.type();
+    const auto outputType = outputDesc.type();
+    const auto indicesType = indicesDesc.type();
+    const auto updatesType = updatesDesc.type();
+    const auto axisType = axisDesc.type();
+
+    VPU_THROW_UNLESS(inputType == DataType::FP16, "input type is invalid");
+    VPU_THROW_UNLESS(outputType == DataType::FP16, "output type is invalid");
+    VPU_THROW_UNLESS(indicesType == DataType::S32, "indices type is invalid");
+    VPU_THROW_UNLESS(updatesType == DataType::FP16, "updates type is invalid");
+    VPU_THROW_UNLESS(axisType == DataType::S32, "axis type is invalid");
+
+    const int inputNDims = inputDesc.numDims();
+    const int outputNDims = outputDesc.numDims();
+    const int indicesNDims = indicesDesc.numDims();
+    const int updatesNDims = updatesDesc.numDims();
+    const int axisNDims = axisDesc.numDims();
+
+    VPU_THROW_UNLESS(inputNDims > 0, "input tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(outputNDims > 0, "output tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(indicesNDims > 0, "indices tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(updatesNDims > 0, "updates tensor must not be 0-dimensional");
+    VPU_THROW_UNLESS(axisNDims > 0, "axis tensor must not be 0-dimensional");
+
+    VPU_THROW_UNLESS(inputNDims == outputNDims,
+                     "input and output have different shapes: inputNDims={}, outputNDims={}",
+                     inputNDims, outputNDims);
+
+    VPU_THROW_UNLESS(updatesNDims == indicesNDims + outputNDims - 1,
+                     "incompatible shapes: indicesNDims=%d, updatesNDims={}, outputNDims={}",
+                     indicesNDims, updatesNDims, outputNDims);
+
+    VPU_THROW_UNLESS(axisNDims == 1,
+                     "axis tensor must be 1-dimensional, but axisNDims={}",
+                     axisNDims);
+
+    const DimsOrder inputDimsOrder = inputDesc.dimsOrder();
+    const DimsOrder outputDimsOrder = outputDesc.dimsOrder();
+    const DimsOrder indicesDimsOrder = indicesDesc.dimsOrder();
+    const DimsOrder updatesDimsOrder = updatesDesc.dimsOrder();
+    const DimsOrder axisDimsOrder = axisDesc.dimsOrder();
+
+    VPU_THROW_UNLESS(inputDimsOrder == outputDimsOrder, "input/output must have same layout"
+                     ", but inputDimsOrder = \"{}\", and outputDimsOrder = \"{}\"",
+                     inputDimsOrder, outputDimsOrder);
+
+    // Check if tensor shapes fit each other, e.g.:
+    //    {N, C, H, W} could be shape of `input` and `output`
+    // {I, J, C, H, W} could be shape of `update` tensor
+    // {I, J}          could be shape of `indices`
+
+    const DimValues& inputDims = inputDesc.dims();
+    const DimValues& outputDims = outputDesc.dims();
+    const DimValues& indicesDims = indicesDesc.dims();
+    const DimValues& updatesDims = updatesDesc.dims();
+    const DimValues& axisDims = axisDesc.dims();
+
+    VPU_THROW_UNLESS(inputDims == outputDims, "input/output tensors must have same lengths"
+                     ", but inputDims = \"{}\", and outputDims = \"{}\"", inputDims, outputDims);
+
+    // Permutation is array of dims, from minor to major
+    const DimVector inputPerm = inputDimsOrder.toPermutation();
+    const DimVector indicesPerm = indicesDimsOrder.toPermutation();
+    const DimVector updatesPerm = updatesDimsOrder.toPermutation();
+
+    // Check if the updates fits the input, e.g.:
+    //    {N, C, H, W} could be shape of `input` and `output`
+    // {I, J, C, H, W} could be shape of `update` tensor
+    for (int i = 0; i < inputNDims - 1; i++) {
+        const Dim inputDim = inputPerm[i];
+        const Dim updatesDim = updatesPerm[i];
+        const int inputSize = inputDims[inputDim];
+        const int updatesSize = updatesDims[updatesDim];
+        VPU_THROW_UNLESS(inputSize == updatesSize,
+                         "updates size must fit input along corresponding axes, "
+                         "but for axis={}: input size={}, updates size={}",
+                         i, inputSize, updatesSize);
+    }
+
+    // Check if the updates fits the indices, e.g.:
+    // {I, J, C, H, W} could be shape of `update` tensor
+    // {I, J}          could be shape of `indices`
+    for (int i = inputNDims - 1; i < updatesNDims; i++) {
+        const int i0 = i - (inputNDims - 1);
+        const Dim indicesDim = indicesPerm[i0];
+        const Dim updatesDim = updatesPerm[i];
+        const int indicesSize = indicesDims[indicesDim];
+        const int updatesSize = updatesDims[updatesDim];
+        VPU_THROW_UNLESS(indicesSize == updatesSize,
+                         "updates size must fit indices along corresponding axes, "
+                         "but for axis={}: indices size={}, updates size={}",
+                         i, indicesSize, updatesSize);
+    }
+
+    // Note, that for a 1D tensor the layout is "C"
+    VPU_THROW_UNLESS(axisDimsOrder == DimsOrder::C,
+                     "axis must be 1D tensor, but its dims order is {}",
+                     axisDimsOrder);
+    VPU_THROW_UNLESS(axisDims[Dim::C] == 1,
+                     "axis tensor must be 1D array of 1 element, but axis length = %d",
+                     axisDims[Dim::C]);
+}
+
+void FrontEnd::parseScatterUpdate(const Model      & model,
+                                  const CNNLayerPtr& layer,
+                                  const DataVector & inputs,
+                                  const DataVector & outputs) const {
+    VPU_THROW_UNLESS(inputs.size() == 4, "invalid number of inputs: {}", inputs.size());
+    VPU_THROW_UNLESS(outputs.size() == 1, "invalid number of outputs: {}", outputs.size());
+
+    const auto& input   = inputs[0];  // `data` tensor
+    const auto& indices = inputs[1];
+    const auto& updates = inputs[2];
+    const auto& axis    = inputs[3];
+    const auto& output = outputs[0];
+
+    checkTensorShapes(input, output, indices, updates, axis);
+
+    auto scatterUpdateLayer = std::dynamic_pointer_cast<ie::ScatterUpdateLayer>(layer);
+
+    VPU_THROW_UNLESS(scatterUpdateLayer != nullptr,
+                     "this layer is not an instance of ScatterUpdateLayer: "
+                     "layer name = \"%s\", layer type = \"%s\"",
+                     layer->name.c_str(), layer->type.c_str());
+
+    auto stage = model->addNewStage<ScatterUpdateStage>(layer->name,
+                                                        StageType::ScatterUpdate,
+                                                        layer,
+                                                        {input, indices, updates, axis},
+                                                        {output});
+
+    VPU_THROW_UNLESS(stage != nullptr,
+                     "failed to create ScatterUpdateStage: "
+                     "layer name = \"%s\", layer type = \"%s\"",
+                     layer->name.c_str(), layer->type.c_str());
+}
+
+//----------------------------------------------------------------------
+
+Stage StageBuilder::addScatterUpdateStage(
+        const Model& model,
+        const std::string& name,
+        const ie::CNNLayerPtr& layer,
+        const Data& input,
+        const Data& output,
+        const Data& indices,
+        const Data& updates,
+        const Data& axis) {
+    checkTensorShapes(input, output, indices, updates, axis);
+
+    auto stage = model->addNewStage<ScatterUpdateStage>(name,
+                                                        StageType::ScatterUpdate,
+                                                        layer,
+                                                        {input, indices, updates, axis},
+                                                        {output});
+
+    VPU_THROW_UNLESS(stage != nullptr,
+                     "failed to create ScatterUpdateStage: "
+                     "layer name = \"%s\", layer type = \"%s\"",
+                     layer->name.c_str(), layer->type.c_str());
+
+    return stage;
+}
+
+}  // namespace vpu
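A worked instance of the rank rule enforced by checkTensorShapes, using the example shapes from its own comments:

    // data/output: {N, C, H, W}    -> 4 dims
    // indices:     {I, J}          -> 2 dims
    // updates:     {I, J, C, H, W} -> 5 dims
    // rule: updatesNDims == indicesNDims + outputNDims - 1, i.e. 5 == 2 + 4 - 1
    static_assert(5 == 2 + 4 - 1, "updates rank equals indices rank plus data rank minus one");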
index ca5119d..2c3ad81 100644 (file)
@@ -79,10 +79,10 @@ private:
         auto inputBiases = inputEdge(2)->input();
         auto output = outputEdge(0)->output();
 
-        input->serializeBuffer(serializer, output->desc().dimsOrder());
+        input->serializeBuffer(serializer);
         output->serializeBuffer(serializer);
-        inputScales->serializeBuffer(serializer, output->desc().dimsOrder());
-        inputBiases->serializeBuffer(serializer, output->desc().dimsOrder());
+        inputScales->serializeBuffer(serializer);
+        inputBiases->serializeBuffer(serializer);
     }
 };
 
index 0dedeb1..fa4a3b0 100644 (file)
@@ -7,6 +7,7 @@
 #include "vpu/utils/auto_scope.hpp"
 #include "vpu/compile_env.hpp"
 #include "graph_transformer.h"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
 
 #include "ie_layers_internal.hpp"
 #include "net_pass.h"
@@ -96,8 +97,8 @@ void FrontEnd::parseTensorIterator(const Model& model, const ie::CNNLayerPtr& la
         VPU_THROW_UNLESS(isConst(original), "VPU const data object can be created only from const IE data object");
 
         const auto& creator = original->getCreatorLayer().lock();
-        const auto& blob = ieBlobContent(creator->blobs.begin()->second);
         const auto& descriptor = createDescriptor(original->getTensorDesc());
+        const auto& blob = ieBlobContent(creator->blobs.begin()->second, descriptor.type());
 
         return model->addConstData(original->getName(), descriptor, blob);
     };
index b35910f..bf30d12 100644 (file)
@@ -43,7 +43,7 @@ endif()
 # "mvnc" must be the first library in the link list
 target_link_libraries(${TARGET_NAME}
     PRIVATE
-        mvnc ${INTEL_ITT_LIBS} ${NGRAPH_LIBRARIES} inference_engine vpu_graph_transformer)
+        mvnc ${INTEL_ITT_LIBS} inference_engine vpu_graph_transformer)
 
 # install
 
index 6d2c5a7..76978c8 100644 (file)
@@ -95,10 +95,12 @@ ExecutableNetwork::ExecutableNetwork(
 void ExecutableNetwork::Import(std::istream& strm,
                                std::vector<DevicePtr> &devicePool,
                                const MyriadConfig& config) {
-    std::ostringstream blobContentStream;
-    blobContentStream << strm.rdbuf();
-    const std::string& blobContentString = blobContentStream.str();
-    std::copy(blobContentString.begin(), blobContentString.end(), std::back_inserter(_graphBlob));
+    auto currentPos = strm.tellg();
+    strm.seekg(0, strm.end);
+    auto blobSize = strm.tellg() - currentPos;
+    _graphBlob.resize(static_cast<size_t>(blobSize));
+    strm.seekg(currentPos, strm.beg);
+    strm.read(&_graphBlob[0], blobSize);
 
     if (!_device->isBooted()) {
         return;
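The rewritten Import above sizes the buffer up front instead of copying through an ostringstream. A minimal standalone sketch of the same stream-slurping pattern, assuming only an open std::istream:

    #include <istream>
    #include <vector>

    // Read everything from the current position to the end of the stream into a byte buffer.
    std::vector<char> readRemainder(std::istream& strm) {
        const auto currentPos = strm.tellg();
        strm.seekg(0, std::ios::end);
        const auto size = strm.tellg() - currentPos;           // bytes left from the current position
        std::vector<char> buffer(static_cast<size_t>(size));
        strm.seekg(currentPos, std::ios::beg);
        strm.read(buffer.data(), static_cast<std::streamsize>(size));
        return buffer;
    }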
index a4b575d..65044cf 100644 (file)
@@ -8,7 +8,7 @@
 #include <utility>
 
 #include <ie_metric_helpers.hpp>
-#include <cnn_network_ngraph_impl.hpp>
+#include <cpp/ie_cnn_network.h>
 #include <cpp_interfaces/base/ie_plugin_base.hpp>
 #include <cpp_interfaces/impl/ie_executable_network_internal.hpp>
 
@@ -16,6 +16,8 @@
 #include <vpu/parsed_config.hpp>
 #include <vpu/utils/profiling.hpp>
 #include <vpu/utils/error.hpp>
+#include <vpu/ngraph/transformations/dynamic_to_static_shape.hpp>
+#include <generic_ie.hpp>
 
 #include "myriad_plugin.h"
 
@@ -33,12 +35,10 @@ ExecutableNetworkInternal::Ptr Engine::LoadExeNetworkImpl(
     auto parsedConfigCopy = _parsedConfig;
     parsedConfigCopy.update(config);
 
-    std::shared_ptr<ICNNNetwork> clonedNetwork(nullptr);
-
-    if (auto networkNGraph = dynamic_cast<const CNNNetworkNGraphImpl*>(&network)) {
-        clonedNetwork = networkNGraph->cloneNGraphImpl();
-    } else {
-        clonedNetwork = cloneNet(network);
+    std::shared_ptr<ICNNNetwork> clonedNetwork = cloneNetwork(network);
+    if (auto func = clonedNetwork->getFunction()) {
+        ngraph::op::GenericIE::DisableReshape noReshape(func);
+        ngraph::pass::DynamicToStaticShape().run_on_function(func);
     }
 
     return std::make_shared<ExecutableNetwork>(*clonedNetwork, _devicePool, parsedConfigCopy);
@@ -76,6 +76,12 @@ void Engine::QueryNetwork(
     auto parsedConfigCopy = _parsedConfig;
     parsedConfigCopy.update(config);
 
+    const auto deviceName = parsedConfigCopy.deviceName();
+    if (!deviceName.empty()) {
+        const auto deviceIDs = GetMetric(METRIC_KEY(AVAILABLE_DEVICES), {}).as<std::vector<std::string>>();
+        VPU_THROW_UNLESS(std::find(deviceIDs.begin(), deviceIDs.end(), deviceName) != deviceIDs.end(), "Myriad device: {} not found.", deviceName);
+    }
+
     const auto log = std::make_shared<Logger>(
         "GraphCompiler",
         parsedConfigCopy.logLevel(),
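The new check validates the requested device ID against the MYRIAD plugin's AVAILABLE_DEVICES metric. A hedged, application-side sketch of querying the same metric through the Inference Engine Core API:

    #include <ie_core.hpp>
    #include <ie_plugin_config.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        InferenceEngine::Core ie;
        // List device IDs reported by the MYRIAD plugin; an empty list means no device was found.
        const auto ids = ie.GetMetric("MYRIAD", METRIC_KEY(AVAILABLE_DEVICES)).as<std::vector<std::string>>();
        for (const auto& id : ids) {
            std::cout << id << std::endl;
        }
        return 0;
    }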
index c0370c9..3994a67 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #include <extension.hpp>
 #include <ngraph/opsets/opset.hpp>
 #include <ngraph/factory.hpp>
index 2314394..ad69466 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index e368038..ea1a9c8 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 168080d..11cc59d 100644 (file)
@@ -5,7 +5,6 @@
 #include <gtest/gtest.h>
 
 #include <cpp/ie_cnn_network.h>
-#include <cnn_network_ngraph_impl.hpp>
 #include <string>
 #include <sstream>
 #include <fstream>
@@ -635,11 +634,11 @@ TEST_F(NGraphReshapeTests, TestInterpParameters) {
     auto ngraph_function = std::make_shared<ngraph::Function>(ngraph::ResultVector{output},
                            ngraph::ParameterVector{inp});
 
-    InferenceEngine::details::CNNNetworkNGraphImpl cnn(ngraph_function);
-    auto icnn = cnn.getCNNNetwork();
+    CNNNetwork cnn(ngraph_function);
+    cnn.begin();
     std::map<std::string, InferenceEngine::SizeVector> inShape;
     inShape["test"] = {1, 3, 4, 5};
-    icnn->reshape(inShape, nullptr);
+    cnn.reshape(inShape);
 }
 
 TEST_F(NGraphReshapeTests, genericNodeWithDynShape) {
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/bf16_network_restoring.cpp
new file mode 100644 (file)
index 0000000..3f02f6c
--- /dev/null
@@ -0,0 +1,211 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class BF16NetworkRestore1 : public BasicBF16Test  {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        //   +   Power1(FP32)
+        //        |
+        //   +  AvgPooling1(FP32)
+        //        |
+        //   + Convolution1(BF16)
+        //        |
+        //   +    ReLU1(Fused)
+        //        |------------------------
+        //        |                        \
+        //   +   Convolution2(BF16)      Convolution 3 (BF16)
+        //           |                     /              \
+        //   +        |                  ReLU2(FP32)     Normalize (FP32)
+        //            \              /                      |
+        //              Eltwise (Fused to Conv2)     ------/
+        //                |                         /
+        //              ReLU3  (Fused to Conv2)   /
+        //                |                     /
+        //             MaxPooling1 (FP32)      /
+        //                   \            /
+        //                      Eltwise
+        //                         |
+
+
+        // STAGE1: construction of the GRAPH
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 224, 224});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Power1");
+
+        // AvgPooling
+        auto avgpoolNode = std::make_shared<opset1::AvgPool>(addNode,
+                                                             Strides{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{2, 2},
+                                                             true,
+                                                             op::RoundingType::FLOOR);
+        avgpoolNode->set_friendly_name("AvgPooling1");
+
+        // convolution1
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            avgpoolNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("Convolution1");
+
+        // ReLU1
+        auto reluNode = std::make_shared<opset1::Relu>(convNode1);
+        reluNode->set_friendly_name("ReLU1");
+
+        // convolution2
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution2");
+
+        // convolution3
+        std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode3->set_friendly_name("Convolution3");
+
+        // ReLU1
+        auto reluNode2 = std::make_shared<opset1::Relu>(convNode3);
+        reluNode2->set_friendly_name("ReLU2");
+
+        // Norm1
+        // normalize
+        const auto axes = make_shared<op::Constant>(element::i64, Shape{2}, vector<int64_t>{2});
+        float eps{1e-6f};
+        auto eps_mode = op::EpsMode::ADD;
+
+        auto normNode =  std::make_shared<opset1::NormalizeL2>(convNode3, axes, eps, eps_mode);
+        normNode->set_friendly_name("Norm1");
+
+
+
+        // Eltwise1
+        auto eltNode1 = std::make_shared<opset1::Add>(convNode2, reluNode2);
+        eltNode1->set_friendly_name("Eltwise1");
+
+        // ReLU3
+        auto reluNode3 = std::make_shared<opset1::Relu>(eltNode1);
+        reluNode3->set_friendly_name("ReLU3");
+
+        // maxPooling1
+        auto maxPoolNode = std::make_shared<opset1::MaxPool>(reluNode3,
+                                                             Strides{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{0, 0},
+                                                             Shape{2, 2},
+                                                             op::RoundingType::FLOOR);
+        maxPoolNode->set_friendly_name("maxPooling1");
+
+        // Eltwise2
+        auto eltNode2 = std::make_shared<opset1::Add>(maxPoolNode, normNode);
+        eltNode2->set_friendly_name("Eltwise2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        threshold = 0.4f;  // max value in the latest tensor for FP32 network is 10.83
+
+        // STAGE2:
+        // filling of the expected precision of layer execution, defined by the precision of the input tensor
+        // to the primitive and reflected in performance counters
+        expectedPrecisions["Power1"] = "FP32";
+        expectedPrecisions["AvgPooling1"] = "FP32";
+        expectedPrecisions["Convolution1"] = "BF16";
+        expectedPrecisions["ReLU1"] = "ndef";
+        expectedPrecisions["Convolution2"] = "BF16";
+        expectedPrecisions["Convolution3"] = "BF16";
+        expectedPrecisions["ReLU2"] = "FP32";
+        expectedPrecisions["Norm1"] = "FP32";
+        expectedPrecisions["Eltwise1"] = "ndef";
+        expectedPrecisions["ReLU3"] = "ndef";
+        expectedPrecisions["maxPooling1"] = "FP32";
+        expectedPrecisions["Eltwise2"] = "FP32";
+    }
+};
+
+TEST_P(BF16NetworkRestore1, CompareWithRefImpl) {
+    test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, BF16NetworkRestore1,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 224, 224 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        BF16NetworkRestore1::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/bfloat16_helpers.hpp
new file mode 100644 (file)
index 0000000..01e2519
--- /dev/null
@@ -0,0 +1,276 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <inference_engine.hpp>
+#include "ie_common.h"
+#include <ie_blob.h>
+#include <math.h>
+#include <map>
+#include <string>
+#include <utility>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+#include "ngraph/opsets/opset1.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include <ie_system_conf.h>
+
+namespace LayerTestsDefinitions {
+
+/**
+ * Class providing static helpers for bfloat16 functional tests.
+ * The helper functions fill tensor content following some periodic law and compare output buffers.
+ *
+ */
+class BFloat16Helpers {
+public:
+    static void fillInputsBySinValues(float* data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = sin(static_cast<float>(i));
+        }
+    }
+
+    static void fillInputsBySinValues(short *data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = reducePrecisionBitwiseS(sin(static_cast<float>(i)));
+        }
+    }
+
+    static void fillInputsByCosValues(float* data, size_t size) {
+        for (size_t i = 0; i < size; i++) {
+            data[i] = cos(static_cast<float>(i));
+        }
+    }
+
+    static int fillInputsBySinValues(InferenceEngine::Blob::Ptr blob) {
+        InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
+        if (!mblob) {
+            return -1;
+        }
+        if (mblob->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32) {
+            return -2;
+        }
+        auto lm = mblob->rwmap();
+        fillInputsBySinValues(lm.as<float*>(), mblob->size());
+        return 0;
+    }
+
+    static std::pair<std::string, std::string> matchPerfCountPrecisionVsExpected(
+        const std::map<std::string, InferenceEngine::InferenceEngineProfileInfo>& perfCounts,
+        const std::map<std::string, std::string>& expected) {
+        for (auto e : expected) {
+            auto it = perfCounts.find(e.first);
+            if (it == perfCounts.end()) {
+                return std::pair<std::string, std::string>(e.first, "NOT_FOUND_IN_PERF_COUNTS");
+            }
+            // get the last n characters of exec_type, where n is the length of e.second
+            std::string execType = it->second.exec_type;
+            std::string pfPrecision = execType.substr(execType.length() - e.second.length(), e.second.length());
+            if (pfPrecision != e.second) {
+                return std::pair<std::string, std::string>(e.first, pfPrecision);
+            }
+        }
+        return std::pair<std::string, std::string>("", "");
+    }
+
+    static float getMaxAbsValue(const float* data, size_t size) {
+        float maxVal = 0.f;
+        for (size_t i = 0; i < size; i++) {
+            if (fabs(data[i]) > maxVal) {
+                maxVal = fabs(data[i]);
+            }
+        }
+        return maxVal;
+    }
+
+    static float reducePrecisionBitwise(const float in) {
+        float f = in;
+        int* i = reinterpret_cast<int*>(&f);
+        int t2 = *i & 0xFFFF0000;
+        float ft1 = *(reinterpret_cast<float*>(&t2));
+        if ((*i & 0x8000) && (*i & 0x007F0000) != 0x007F0000) {
+            t2 += 0x10000;
+            ft1 = *(reinterpret_cast<float*>(&t2));
+        }
+        return ft1;
+    }
+
+    static short reducePrecisionBitwiseS(const float in) {
+        float f = reducePrecisionBitwise(in);
+        int intf = *reinterpret_cast<int*>(&f);
+        intf = intf >> 16;
+        short s = intf;
+        return s;
+    }
+};
+
+
+typedef std::tuple<
+                   InferenceEngine::Precision,
+                   InferenceEngine::Precision,
+                   InferenceEngine::SizeVector,
+                   InferenceEngine::SizeVector,
+                   std::string> basicParams;
+
+
+/**
+ * Base class for bf16 tests
+ * the flow in this test is to load the network in FP32 and in BF16 modes and verify
+ * 1. the difference between output tensors against some threshold;
+ * 2. which precision was selected for the layers described in the runtime info of the performance counters.
+ *
+ * To develop a new test you need to
+ * 1. define a class inherited from BasicBF16Test and implement SetUp(). For example:
+ *
+ * class ScaleshiftConv_x3_Eltwise : public BasicBF16Test {
+ * protected:
+ * void SetUp()override {
+ *  fnPtr = std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+
+        // STAGE1:
+        threshold = 9e-1;
+
+        // STAGE2:
+        // filling of the expected precision of layer execution, defined by the precision of the input tensor
+        // to the primitive and reflected in performance counters
+        expectedPrecisions["Add_4"] = "FP32";
+        expectedPrecisions["Convolution_6"] = "BF16";
+        expectedPrecisions["Convolution_7"] = "BF16";
+        expectedPrecisions["Add_8"] = "ndef";
+ *      expectedPrecisions["Convolution_10"] = "BF16";
+ *      }
+ *      };
+ *
+ *  2. define test
+ *  TEST_P(ScaleshiftConv_x3_Eltwise, CompareWithRefImpl) {
+    test();
+};
+ *  3. INSTANTIATE_TEST_CASE_P(bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x3_Eltwise::getTestCaseName);
+
+ *
+ * In the 3rd stage do not forget the bfloat16 prefix!
+ */
+class BasicBF16Test : public LayerTestsUtils::LayerTestsCommonClass<basicParams> {
+protected:
+    virtual std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) = 0;
+
+public:
+    std::shared_ptr<ngraph::Function> fnPtr;
+    std::vector<float *> refOut;
+    InferenceEngine::SizeVector inputShapes, newInputShapes;
+    InferenceEngine::SizeVector refOutShape;
+    std::map<std::string, std::string> expectedPrecisions;
+    float threshold = 2e-2;  // enough for tensors whose absolute maximum values are less than 1
+
+    static std::string getTestCaseName(testing::TestParamInfo<basicParams> obj) {
+        InferenceEngine::Precision inputPrecision, netPrecision;
+        InferenceEngine::SizeVector inputShapes, newInputShapes;
+        std::string targetDevice;
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = obj.param;
+
+        std::ostringstream result;
+        if (!newInputShapes.empty()) {
+            result << "Reshape_From=" << CommonTestUtils::vec2str(inputShapes);;
+            result << "_To=" << CommonTestUtils::vec2str(newInputShapes) << "_";
+        } else {
+            result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+        }
+        result << "inPRC=" << inputPrecision.name() << "_";
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice;
+        return result.str();
+    }
+
+    void test() {
+        if (!InferenceEngine::with_cpu_x86_bfloat16()) {
+            // on platforms which do not support bfloat16, we are disabling bf16 tests since there are no bf16 primitives,
+            // tests are useless on such platforms
+            return;
+        }
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        InferenceEngine::CNNNetwork cnnNet(fnPtr);
+
+        setNetInOutPrecision(cnnNet, inputPrecision);
+        std::string inputName = cnnNet.getInputsInfo().begin()->first;
+        std::string outputName = cnnNet.getOutputsInfo().begin()->first;
+        auto ie = InferenceEngine::Core();
+        // BF16 inference
+        std::map<std::string, std::string> options;
+        if (netPrecision == InferenceEngine::Precision::FP32) {
+            options[InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16] = InferenceEngine::PluginConfigParams::YES;
+        } else {
+            options[InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16] = InferenceEngine::PluginConfigParams::NO;
+        }
+        options[InferenceEngine::PluginConfigParams::KEY_PERF_COUNT] = InferenceEngine::PluginConfigParams::YES;
+        options[InferenceEngine::PluginConfigParams::KEY_DUMP_EXEC_GRAPH_AS_DOT] = "egraph_test";
+
+        auto exec_net1 = ie.LoadNetwork(cnnNet, targetDevice, options);
+        auto req1 = exec_net1.CreateInferRequest();
+
+        InferenceEngine::Blob::Ptr inBlob1 = req1.GetBlob(inputName);
+        BFloat16Helpers::fillInputsBySinValues(inBlob1);
+
+        req1.Infer();
+        auto outBlobBF16 = req1.GetBlob(outputName);
+        InferenceEngine::MemoryBlob::CPtr mout1 = InferenceEngine::as<InferenceEngine::MemoryBlob>(outBlobBF16);
+        ASSERT_NE(mout1, nullptr);
+        auto lm1 = mout1->rmap();
+
+        // FP32 inference
+        // if netPrecision is not equal to FP32, change the network precision and recreate the network
+        InferenceEngine::CNNNetwork cnnNetFP32(createGraph(InferenceEngine::Precision::FP32));
+        std::string inputNameFP32 = cnnNetFP32.getInputsInfo().begin()->first;
+        std::string outputNameFP32 = cnnNetFP32.getOutputsInfo().begin()->first;
+        setNetInOutPrecision(cnnNetFP32, inputPrecision);
+        auto exec_net2 = ie.LoadNetwork(cnnNetFP32, targetDevice,
+                                        { { InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16, InferenceEngine::PluginConfigParams::NO } });
+        auto req2 = exec_net2.CreateInferRequest();
+
+
+        req2.SetBlob(inputNameFP32, inBlob1);
+
+        req2.Infer();
+        auto outBlobFP32 = req2.GetBlob(outputNameFP32);
+        InferenceEngine::MemoryBlob::CPtr mout2 = InferenceEngine::as<InferenceEngine::MemoryBlob>(outBlobFP32);
+        ASSERT_NE(mout2, nullptr);
+        auto lm2 = mout2->rmap();
+
+        // debug to figure out the maximum value in output tensors:
+        // std::cout << "Max in bfloat16 network by output " << outputName << ": " <<
+        //      BFloat16Helpers::getMaxAbsValue(lm1.as<const float *>(), mout1->size()) << std::endl;
+        // std::cout << "Max in fp32 network by output " << outputNameFP32 << ": " <<
+        //     BFloat16Helpers::getMaxAbsValue(lm2.as<const float *>(), mout2->size()) << std::endl;
+
+        FuncTestUtils::compareRawBuffers(lm1.as<const float *>(),
+                                         lm2.as<const float *>(),
+                                         mout1->size(), mout2->size(),
+                                         threshold);
+
+        // Stage2: verification of performance counters
+        std::pair<std::string, std::string> wrongLayer =
+            BFloat16Helpers::matchPerfCountPrecisionVsExpected(req1.GetPerformanceCounts(), expectedPrecisions);
+        if (wrongLayer.first != std::string("")) {
+            std::string layerInPerfCounts = wrongLayer.first + " " + wrongLayer.second;
+            std::string layerExpected = wrongLayer.first + " " + expectedPrecisions[wrongLayer.first];
+            ASSERT_EQ(layerInPerfCounts, layerExpected);
+        }
+        fnPtr.reset();
+    }
+};
+
+}  // namespace LayerTestsDefinitions
+
+
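For reference, reducePrecisionBitwiseS keeps the upper 16 bits of the FP32 bit pattern (with a round-to-nearest adjustment). A minimal sketch of the truncation-only step, using memcpy instead of pointer casts to avoid aliasing (hypothetical helper, not part of the test utilities):

    #include <cstdint>
    #include <cstring>

    // Truncate an FP32 value to its bfloat16 bit pattern (no rounding adjustment applied here).
    short toBF16BitsTruncated(float f) {
        uint32_t bits = 0;
        std::memcpy(&bits, &f, sizeof(bits));    // reinterpret the float's bit pattern safely
        return static_cast<short>(bits >> 16);   // e.g. 2.0f (0x40000000) -> 0x4000
    }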
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_conv.cpp
new file mode 100644 (file)
index 0000000..a55c7e1
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ConvConv : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        //     ScaleShift (FP32)
+        //          |
+        //        Conv (BF16)
+        //          |
+        //        Conv (BF16)
+
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ngraph::element::f32, ngraph::Shape{1, 3, 40, 40});
+        auto const1 = opset1::Constant::create(ngraph::element::f32, Shape{1}, { 2.0f });
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        auto const2 = opset1::Constant::create(ngraph::element::f32, Shape{1}, { 1.0f });
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        std::vector<float> weightValues;
+        weightValues.resize(3 * 3 * 3 * 3);
+        BFloat16Helpers::fillInputsBySinValues(weightValues.data(), weightValues.size());
+        auto weightsNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape, weightValues);
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Convolution
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        std::vector<float> weightValues2;
+        weightValues2.resize(3 * 3 * 3 * 3);
+        BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+        auto weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape2, weightValues2);
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            convNode1, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        // the maximum value in the latest tensor for this test is 24.4; it would be safe to set the threshold to 0.1
+        threshold = 0.3f;
+        // STAGE2:
+        // filling of the expected precision of layer execution, defined by the precision of the input tensor
+        // to the primitive and reflected in performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "BF16";
+    }
+};
+
+TEST_P(ConvConv, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvConv,
+                        ::testing::Combine(
+                        ::testing::Values(Precision::FP32),
+                        ::testing::Values(Precision::FP32),
+                        ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                        ::testing::Values(SizeVector()),
+                        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvConv::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_dwconv_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_dwconv_relu.cpp
new file mode 100644 (file)
index 0000000..1ab6f50
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ConvDWConvReLU : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        //             scaleshift (FP32)
+        //                |
+        //               Conv (BF16)
+        //                |
+        //            Depthwise Conv (BF16, assuming explicit separate execution of the kernel, not fused into the previous convolution)
+        //                |
+        //               ReLU (Fused Info DW convolution)
+
+
+        // multiply
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // DW convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 1, 1, 3, 3 };  // groups, out channels per group, in channels per group, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2FP32;
+            weightValues2FP32.resize(3 * 1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2FP32.data(), weightValues2FP32.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2FP32);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::GroupConvolution>(
+            convNode1, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // ReLU
+        auto reluNode2 =  std::make_shared<opset1::Relu>(convNode2);
+        reluNode2->set_friendly_name("RELU");
+
+        return std::make_shared<ngraph::Function>(reluNode2, ngraph::ParameterVector{input1});
+    }
+    void SetUp()override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.4f;  // maximum value in tensor is 54.89
+        // STAGE2:
+        // filling of the expected precision of layer execution, defined by the precision of the input tensor
+        // to the primitive and reflected in performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["RELU"] = "ndef";
+    }
+};
+
+TEST_P(ConvDWConvReLU, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvDWConvReLU,
+                            ::testing::Combine(
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                    ::testing::Values(SizeVector()),
+                                    ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvDWConvReLU::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvDWConvReLU,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvDWConvReLU::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/conv_relu_pool_conv_relu_pool.cpp
new file mode 100644 (file)
index 0000000..8ec2cfd
--- /dev/null
@@ -0,0 +1,199 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ConvReLUPoolConvReLUPool : public BasicBF16Test  {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision)override {
+        //    Convolution1  (FP32)
+        //        |
+        //       ReLU1      (Fused)
+        //        |
+        //     Pooling1     (FP32)
+        //        |
+        //    Convolution2  (BF16)
+        //        |
+        //       ReLU2      (Fused)
+        //        |
+        //     Pooling2     (BF16)
+        //        |
+        //    Convolution3  (BF16)
+
+
+        // STAGE1: construction of the GRAPH
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+
+        // convolution1
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
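+            // BF16 path: weights are kept as raw bfloat16 bit patterns in 16-bit storage (per the BFloat16Helpers convention)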
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode->set_friendly_name("Convolution_1");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(convNode);
+        reluNode->set_friendly_name("ReLU_1");
+
+        // Pooling
+        auto avgpoolNode = std::make_shared<opset1::AvgPool>(reluNode,
+                                                             Strides{1, 1},   // strides
+                                                             Shape{1, 1},     // pads begin
+                                                             Shape{1, 1},     // pads end
+                                                             Shape{2, 2},     // kernel
+                                                             true,            // exclude pad
+                                                             op::RoundingType::FLOOR);
+        avgpoolNode->set_friendly_name("AvgPool_1");
+
+        // convolution2
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            avgpoolNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution_2");
+
+        // ReLU
+        auto reluNode2 = std::make_shared<opset1::Relu>(convNode2);
+        reluNode2->set_friendly_name("ReLU_2");
+
+        // Pooling
+        auto maxpoolNode2 = std::make_shared<opset1::MaxPool>(reluNode2,
+                                                             Strides{1, 1},   // strides
+                                                             Shape{1, 1},     // pads begin
+                                                             Shape{0, 0},     // pads end
+                                                             Shape{2, 2},     // kernel
+                                                             op::RoundingType::FLOOR);
+        maxpoolNode2->set_friendly_name("MaxPool_2");
+
+        // convolution3
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode3 = nullptr;
+        ngraph::Shape convFilterShape3 = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+            maxpoolNode2, weightsNode3,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode3->set_friendly_name("Convolution_3");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        threshold = 0.2f;  // max value in the output tensor of the FP32 network is 9.8
+
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["Convolution_1"] = "FP32";
+        expectedPrecisions["ReLU_1"] = "ndef";
+        expectedPrecisions["AvgPool_1"] = "FP32";
+        expectedPrecisions["Convolution_2"] = "BF16";
+        expectedPrecisions["ReLU_2"] = "ndef";
+        expectedPrecisions["MaxPool_2"] = "BF16";
+        expectedPrecisions["Convolution_3"] = "BF16";
+    }
+};
+
+TEST_P(ConvReLUPoolConvReLUPool, CompareWithRefImpl) {
+    test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ConvReLUPoolConvReLUPool,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvReLUPoolConvReLUPool::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ConvReLUPoolConvReLUPool,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ConvReLUPoolConvReLUPool::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/faster_100_5_1_1_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/faster_100_5_1_1_conv.cpp
new file mode 100644 (file)
index 0000000..efc9c6d
--- /dev/null
@@ -0,0 +1,135 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class Faster100_5_1_1_Conv : public BasicBF16Test  {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //                     Power (FP32)
+        //                       |
+        //                     Convolution (BF16)
+
+        // STAGE1: construction of the GRAPH
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{10, 5, 1, 1});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_4");
+
+        // problematic convolution: 100x5x1x1
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 5, 5, 1, 1 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues;
+            weightValues.resize(5 * 5 * 1 * 1, 0.f);
+            weightValues[0] = 1.0f;
+            weightValues[7] = 1.0f;
+            weightValues[11] = 1.0f;
+            weightValues[19] = 1.0f;
+            weightValues[23] = 1.0f;
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, convFilterShape, weightValues);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(5 * 5 * 1 * 1, BFloat16Helpers::reducePrecisionBitwiseS(0.0f));
+            weightValuesBF16[0] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+            weightValuesBF16[7] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+            weightValuesBF16[11] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+            weightValuesBF16[19] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+            weightValuesBF16[23] = BFloat16Helpers::reducePrecisionBitwiseS(1.0f);
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode->set_friendly_name("Convolution_6");
+
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(convNode);
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
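+        // no explicit accuracy threshold is set here; the default from BasicBF16Test is assumed to be sufficient for this small graph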
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["Add_4"] = "FP32";
+        expectedPrecisions["Convolution_6"] = "BF16";
+    }
+};
+
+TEST_P(Faster100_5_1_1_Conv, CompareWithRefImpl) {
+    test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Faster100_5_1_1_Conv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 10, 5, 1, 1 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Faster100_5_1_1_Conv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Faster100_5_1_1_Conv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 10, 5, 1, 1 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Faster100_5_1_1_Conv::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/mobilenet_ssd_with_branching.cpp
new file mode 100644 (file)
index 0000000..e8eafcb
--- /dev/null
@@ -0,0 +1,183 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class MobileNet_ssd_with_branching : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //                scaleshift
+        //                    |
+        //                   Conv1 (FP32)
+        //                  |           \
+        //               Conv2 (FP32 for now, until a greedy mode is available. This must be fixed: such a pattern should have Conv2 in BF16)
+        //                |              |
+        //               relu(fused)     |
+        //                |          Normalize (not LRN)
+        //           Conv (DW)(BF16)     |
+        //                |              |
+        //               ReLU (Fused)    |
+        //                  \           /
+        //                    Concat
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // Conv1
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Conv2
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            convNode1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // ReLU
+        auto reluNode =  std::make_shared<opset1::Relu>(convNode2);
+        reluNode->set_friendly_name("RELU_2");
+
+        // DW convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 1, 1, 3, 3 };  // groups, out channels per group, input channels per group, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2FP32;
+            weightValues2FP32.resize(3 * 1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2FP32.data(), weightValues2FP32.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2FP32);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> dwConvNode = std::make_shared<ngraph::opset1::GroupConvolution>(
+            reluNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        dwConvNode->set_friendly_name("DW_CONV");
+
+        // ReLU
+        auto reluNode2 =  std::make_shared<opset1::Relu>(dwConvNode);
+        reluNode2->set_friendly_name("RELU_DW");
+
+        // normalize
+        const auto axes = make_shared<op::Constant>(element::i64, Shape{2}, vector<int64_t>{2});
+        float eps{1e-6f};
+        auto eps_mode = op::EpsMode::ADD;
+
+        auto normNode =  std::make_shared<opset1::NormalizeL2>(convNode1, axes, eps, eps_mode);
+        normNode->set_friendly_name("NORM_1");
+
+        // Concat
+        ngraph::NodeVector concInputNodes = { reluNode2, normNode };
+        auto concNode = std::make_shared<opset1::Concat>(concInputNodes, 1);
+        concNode->set_friendly_name("CONC_1");
+
+        return std::make_shared<ngraph::Function>(concNode, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.8f;  // max value in the output tensor is 87.67
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "FP32";
+        expectedPrecisions["RELU_2"] = "ndef";
+        expectedPrecisions["DW_CONV"] = "BF16";
+        expectedPrecisions["RELU_DW"] = "ndef";
+        expectedPrecisions["NORM_1"] = "FP32";
+        expectedPrecisions["CONC_1"] = "FP32";
+    }
+};
+
+TEST_P(MobileNet_ssd_with_branching, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, MobileNet_ssd_with_branching,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        MobileNet_ssd_with_branching::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, MobileNet_ssd_with_branching,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        MobileNet_ssd_with_branching::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_conv.cpp
new file mode 100644 (file)
index 0000000..e165f86
--- /dev/null
@@ -0,0 +1,153 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConvEltwiseConv : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //        scaleshift (FP32)     Conv (FP32)
+        //                   \          /
+        //              Eltwise (Fused into Conv)
+        //                |
+        //               Conv (BF16)
+
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(addNode, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        // Convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2;
+            weightValues2.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            eltNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.2f;  // max value in the output tensor of the FP32 network is 37.77
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConvEltwiseConv, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseConv,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseConv::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_relu_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_relu_conv.cpp
new file mode 100644 (file)
index 0000000..ba7c17b
--- /dev/null
@@ -0,0 +1,159 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConvEltwiseReluConv : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //        scaleshift (FP32)     Conv (FP32)
+        //             \          /
+        //               Eltwise (Fused into conv)
+        //                |
+        //               ReLU (Fused into conv)
+        //                |
+        //               Conv (BF16)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(addNode, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        // ReLU
+        auto reluNode =  std::make_shared<opset1::Relu>(eltNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        // Convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValues2;
+            weightValues2.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2.data(), weightValues2.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2);
+        } else {
+            std::vector<short> weightValues2BF16;
+            weightValues2BF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValues2BF16.data(), weightValues2BF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValues2BF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            reluNode, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 9e-2;
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["RELU_1"] = "ndef";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConvEltwiseReluConv, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseReluConv,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseReluConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseReluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseReluConv::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_scaleshift.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_eltwise_scaleshift.cpp
new file mode 100644 (file)
index 0000000..41f81cb
--- /dev/null
@@ -0,0 +1,152 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConvEltwiseScaleshift : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //                    scaleshift (FP32)
+        //                        |
+        //                       Conv (BF16)
+        //             \          /
+        //              Eltwise (Fused into Conv)
+        //                |
+        //            scaleshift (FP32)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Eltwise, i.e. Add of the original network input and the convolution output
+        auto eltNode = std::make_shared<opset1::Add>(input1, convNode1);
+        eltNode->set_friendly_name("ELT_1");
+
+        auto reluNode =  std::make_shared<opset1::Relu>(eltNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        // multiply
+        std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+        } else {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+        }
+        auto mulNode2 = std::make_shared<opset1::Multiply>(reluNode, const3);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+        addNode2->set_friendly_name("ADD_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{addNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 0.4;
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["ADD_2"] = "FP32";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConvEltwiseScaleshift, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEltwiseScaleshift,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseScaleshift::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEltwiseScaleshift,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEltwiseScaleshift::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_elu_conv.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_elu_conv.cpp
new file mode 100644 (file)
index 0000000..fc63492
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConvEluConv : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //              scaleshift (FP32)
+        //                 |
+        //                Conv (BF16)
+        //                |
+        //                Elu (FP32 for now, this must be fixed and it must be fused into Conv)
+        //                 |
+        //                Conv (BF16)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // Elu with alpha = 2.0
+        auto eluNode =  std::make_shared<opset1::Elu>(convNode1, 2);
+        eluNode->set_friendly_name("ELU_1");
+
+        // Conv
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            eluNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 1;
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["ELU_1"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+    }
+};
+
+TEST_P(ScaleshiftConvEluConv, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvEluConv,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEluConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvEluConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvEluConv::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_relu.cpp
new file mode 100644 (file)
index 0000000..6e5df70
--- /dev/null
@@ -0,0 +1,126 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConvRelu : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //              scaleshift (FP32)
+        //                  |
+        //                Conv (BF16)
+        //                  |
+        //                relu (Fused into convolution)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // ReLU
+        auto reluNode =  std::make_shared<opset1::Relu>(convNode1);
+        reluNode->set_friendly_name("RELU_1");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 5e-2;
+        // STAGE2:
+        // fill the expected execution precision of each layer, defined by the precision of the input tensor to the primitive and reflected in
+        // performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["RELU_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConvRelu, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConvRelu,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvRelu::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConvRelu,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConvRelu::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_concat_relu.cpp
new file mode 100644 (file)
index 0000000..11dcb1e
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x2_ConcatRelu : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //              scaleshift
+        //             /         \
+        //           Conv      Conv
+        //             \       /
+        //              concat
+        //                |
+        //               relu
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // Concat
+        ngraph::NodeVector concInputNodes = { convNode1, convNode2 };
+
+        // TODO: the test fails when the concatenation axis is 1, so axis 2 is used here for now
+        auto concNode = std::make_shared<opset1::Concat>(concInputNodes, 2);
+        concNode->set_friendly_name("CONC_1");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(concNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{reluNode}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 10e-1;
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["CONC_1"] = "FP32";
+        expectedPrecisions["RELU_1"] = "FP32";
+    }
+};
+
+TEST_P(ScaleshiftConv_x2_ConcatRelu, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_ConcatRelu,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_ConcatRelu::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_ConcatRelu,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_ConcatRelu::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_eltwise.cpp
new file mode 100644 (file)
index 0000000..82340d2
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x2_Eltwise : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //              scaleshift (FP32)
+        //             /             \
+        //           Conv1 (BF16)     Conv2 (BF16)
+        //             \               /
+        //                eltwise (Fused into Conv1) produces FP32 output
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+        eltNode->set_friendly_name("ELT_1");
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 2e-1;
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConv_x2_Eltwise, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_Eltwise,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_Eltwise::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed1_eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed1_eltwise.cpp
new file mode 100644 (file)
index 0000000..86bcc52
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x2_mixed1_Eltwise : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //      scaleshift (FP32)      input
+        //             |                 |
+        //      Conv1 (BF16)       Conv2 (FP32)
+        //             \               /
+        //            eltwise (Fused into Conv1)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+        eltNode->set_friendly_name("ELT_1");
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 2e-1;
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["ADD_1"] = "FP32";
+        expectedPrecisions["CONV_1"] = "BF16";
+        expectedPrecisions["CONV_2"] = "FP32";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConv_x2_mixed1_Eltwise, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_mixed1_Eltwise,
+                            ::testing::Combine(
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                    ::testing::Values(SizeVector()),
+                                    ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_mixed1_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_mixed1_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_mixed1_Eltwise::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed2_eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x2_mixed2_eltwise.cpp
new file mode 100644 (file)
index 0000000..1e483bf
--- /dev/null
@@ -0,0 +1,138 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x2_mixed2_Eltwise : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //          input         scaleshift (FP32)
+        //             |                |
+        //     Conv1 (FP32)        Conv2 (BF16)
+        //             \               /
+        //             eltwise (Fused into Conv1)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            input1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("CONV_1");
+
+        // multiply
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("ADD_2");
+
+        // convolution
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("CONV_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+        eltNode->set_friendly_name("ELT_1");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 2e-1;
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["CONV_1"] = "FP32";
+        expectedPrecisions["ADD_2"] = "FP32";
+        expectedPrecisions["CONV_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "ndef";
+    }
+};
+
+TEST_P(ScaleshiftConv_x2_mixed2_Eltwise, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x2_mixed2_Eltwise,
+                            ::testing::Combine(
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(Precision::FP32),
+                                    ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                    ::testing::Values(SizeVector()),
+                                    ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_mixed2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x2_mixed2_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x2_mixed2_Eltwise::getTestCaseName);
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x3_eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_conv_x3_eltwise.cpp
new file mode 100644 (file)
index 0000000..361dc01
--- /dev/null
@@ -0,0 +1,170 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class ScaleshiftConv_x3_Eltwise : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //        scaleshift (FP32)
+        //
+        //        /        \
+        //
+        //       Conv1 (BF16)    Conv2 (BF16)
+        //
+        //        \        /
+        //
+        //        Eltwise (Fused to Conv1)
+        //
+        //          |
+        //
+        //         Conv3 (BF16)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 16, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("Convolution_1");
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(convNode1, convNode2);
+        eltNode->set_friendly_name("ELT_1");
+
+
+        // Convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode3 = nullptr;
+        ngraph::Shape convFilterShape3 = { 16, 16, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(16 * 16 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(16 * 16 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode3 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape3, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode3 = std::make_shared<ngraph::opset1::Convolution>(
+            eltNode, weightsNode3,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode3->set_friendly_name("Convolution_3");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode3}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 1.0f;  // max value in the latest tensor for FP32 network is 93.3
+
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["Add_1"] = "FP32";
+        expectedPrecisions["Convolution_1"] = "BF16";
+        expectedPrecisions["Convolution_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "ndef";
+        expectedPrecisions["Convolution_3"] = "BF16";
+    }
+};
+
+TEST_P(ScaleshiftConv_x3_Eltwise, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x3_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, ScaleshiftConv_x3_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        ScaleshiftConv_x3_Eltwise::getTestCaseName);
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x2_conv_x2_eltwise.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x2_conv_x2_eltwise.cpp
new file mode 100644 (file)
index 0000000..3daece1
--- /dev/null
@@ -0,0 +1,160 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class Scaleshift_x2_Conv_x2_Eltwise : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //
+        //  scaleshift (FP32)    scaleshift (FP32)
+        //        \             /      \
+        //        Eltwise  (FP32)   Conv (BF16)
+        //          |                 |
+        //                          Conv (BF16)
+        //                            |
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_1");
+
+        // multiply
+        std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+        } else {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+        }
+        auto mulNode2 = std::make_shared<opset1::Multiply>(input1, const3);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+        addNode2->set_friendly_name("Add_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(addNode, addNode2);
+        eltNode->set_friendly_name("ELT_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode2, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("Convolution_1");
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            convNode1, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{eltNode, convNode2}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 1;
+
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["Add_1"] = "FP32";
+        expectedPrecisions["Add_2"] = "FP32";
+        expectedPrecisions["Convolution_1"] = "BF16";
+        expectedPrecisions["Convolution_2"] = "BF16";
+        expectedPrecisions["ELT_1"] = "FP32";
+    }
+};
+
+TEST_P(Scaleshift_x2_Conv_x2_Eltwise, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Scaleshift_x2_Conv_x2_Eltwise,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Scaleshift_x2_Conv_x2_Eltwise::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Scaleshift_x2_Conv_x2_Eltwise,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Scaleshift_x2_Conv_x2_Eltwise::getTestCaseName);
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/scaleshift_x3_conv_eltwise_relu.cpp
new file mode 100644 (file)
index 0000000..3c22d18
--- /dev/null
@@ -0,0 +1,182 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <functional>
+#include <map>
+#include <utility>
+
+#include <ie_core.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class Scaleshift_x3_ConvEltwiseRelu : public BasicBF16Test {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //
+        //   scaleshift (FP32)
+        //        |
+        //       Conv (BF16)   scaleshift (FP32)
+        //
+        //         \         /
+        //
+        //        Eltwise (Fused to Conv)
+        //          |
+        //         ReLU   (Fused to Conv)
+        //           |
+        //       scaleshift  (FP32)
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_1");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 3, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(3 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode1 = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 1, 1 }),  // pad begin
+            ngraph::CoordinateDiff({ 1, 1 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode1->set_friendly_name("Convolution_1");
+
+        // multiply
+        std::shared_ptr<ngraph::opset1::Constant> const3 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+        } else {
+            const3 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+        }
+        auto mulNode2 = std::make_shared<opset1::Multiply>(input1, const3);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const4 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const4 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto addNode2 = std::make_shared<opset1::Add>(mulNode2, const4);
+        addNode2->set_friendly_name("Add_2");
+
+        // Eltwise, i.e. Add
+        auto eltNode = std::make_shared<opset1::Add>(convNode1, addNode2);
+        eltNode->set_friendly_name("ELT_1");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(eltNode);
+        reluNode->set_friendly_name("RELU_1");
+
+        // multiply
+        std::shared_ptr<ngraph::opset1::Constant> const5 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const5 = opset1::Constant::create(ntype, Shape{1}, { 4.0f });
+        } else {
+            const5 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(4.0f)) });
+        }
+        auto mulNode3 = std::make_shared<opset1::Multiply>(reluNode, const5);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const6 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const6 = opset1::Constant::create(ntype, Shape{1}, { 3.0f });
+        } else {
+            const6 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(3.0f)) });
+        }
+        auto addNode3 = std::make_shared<opset1::Add>(mulNode3, const6);
+        addNode3->set_friendly_name("Add_3");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{addNode3}, ngraph::ParameterVector{input1});
+    }
+
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        // STAGE1:
+        threshold = 2e-1;
+
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["Add_1"] = "FP32";
+        expectedPrecisions["Convolution_1"] = "BF16";
+        expectedPrecisions["Add_2"] = "FP32";
+        expectedPrecisions["ELT_1"] = "ndef";
+        expectedPrecisions["RELU_1"] = "ndef";
+        expectedPrecisions["Add_3"] = "FP32";
+    }
+};
+
+TEST_P(Scaleshift_x3_ConvEltwiseRelu, CompareWithRefImpl) {
+    test();
+};
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, Scaleshift_x3_ConvEltwiseRelu,
+                        ::testing::Combine(
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(Precision::FP32),
+                                ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                                ::testing::Values(SizeVector()),
+                                ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Scaleshift_x3_ConvEltwiseRelu::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, Scaleshift_x3_ConvEltwiseRelu,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        Scaleshift_x3_ConvEltwiseRelu::getTestCaseName);
+
+
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/tail_fp32_optimization.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/tail_fp32_optimization.cpp
new file mode 100644 (file)
index 0000000..3a99203
--- /dev/null
@@ -0,0 +1,141 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class PoolingAfterConv : public BasicBF16Test  {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //    Scaleshift   (FP32)
+        //        |
+        //    Convolution  (BF16)
+        //        |
+        //       ReLU      (Fused)
+        //        |
+        //     Pooling     (FP32) <- this layer can be executed in BF16 if it passes data to the next BF16 layer;
+        //                           otherwise the tail optimization should return Pooling to FP32
+
+        // STAGE1: construction of the GRAPH
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_4");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 16, 3, 3, 3 };  // out channels, input channels, kernel h, kernel w
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode->set_friendly_name("Convolution_6");
+
+        // ReLU
+        auto reluNode = std::make_shared<opset1::Relu>(convNode);
+
+        // Pooling
+        auto avgpoolNode = std::make_shared<opset1::AvgPool>(reluNode,
+                                                             Strides{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{1, 1},
+                                                             Shape{2, 2},
+                                                             true,
+                                                             op::RoundingType::FLOOR);
+        avgpoolNode->set_friendly_name("AvgPool_8");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{avgpoolNode}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        threshold = 0.14f;  // max value in the latest tensor for FP32 network is 14.6448
+
+        // STAGE2:
+        // fill in the expected execution precision of each layer; it is defined by the precision of the
+        // input tensor to the primitive and is reflected in the performance counters
+        expectedPrecisions["Add_4"] = "FP32";
+        expectedPrecisions["Convolution_6"] = "BF16";
+        expectedPrecisions["AvgPool_8"] = "FP32";
+    }
+};
+
+TEST_P(PoolingAfterConv, CompareWithRefImpl) {
+    test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, PoolingAfterConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                            PoolingAfterConv::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, PoolingAfterConv,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        PoolingAfterConv::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/bfloat16/topk_inputs_i32.cpp b/inference-engine/tests/functional/plugin/cpu/bfloat16/topk_inputs_i32.cpp
new file mode 100644 (file)
index 0000000..ef704c0
--- /dev/null
@@ -0,0 +1,165 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "bfloat16_helpers.hpp"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+#include <map>
+#include <functional>
+#include <utility>
+
+#include <ie_core.hpp>
+#include <ie_plugin_config.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+
+#include "ngraph/opsets/opset1.hpp"
+
+using namespace std;
+using namespace ngraph;
+using namespace InferenceEngine;
+
+namespace LayerTestsDefinitions {
+
+class TopKInputsI32 : public BasicBF16Test  {
+protected:
+    std::shared_ptr<ngraph::Function> createGraph(InferenceEngine::Precision netPrecision) override {
+        //      Power   (FP32)
+        //        |
+        //      Convolution1 (BF16)       Const (I32)
+        //               |                |
+        //               \                /
+        //                  TopK (FP32)
+        //              (BF16)/        \ (I32)
+        //                   |
+        //         Convolution 2
+
+        // STAGE1: construction of the GRAPH
+
+        ngraph::element::Type ntype = (netPrecision == Precision::FP32) ? ngraph::element::f32 : ngraph::element::bf16;
+        // multiply
+        auto input1 = std::make_shared<opset1::Parameter>(ntype, ngraph::Shape{1, 3, 40, 40});
+        input1->set_friendly_name("Input_1");
+        std::shared_ptr<ngraph::opset1::Constant> const1 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { 2.0f });
+        } else {
+            const1 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(2.0f)) });
+        }
+        auto mulNode = std::make_shared<opset1::Multiply>(input1, const1);
+
+        // add
+        std::shared_ptr<ngraph::opset1::Constant> const2 = nullptr;
+        if (netPrecision == Precision::FP32) {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { 1.0f });
+        } else {
+            const2 = opset1::Constant::create(ntype, Shape{1}, { bfloat16::from_bits(BFloat16Helpers::reducePrecisionBitwiseS(1.0f)) });
+        }
+        auto addNode = std::make_shared<opset1::Add>(mulNode, const2);
+        addNode->set_friendly_name("Add_4");
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode = nullptr;
+        ngraph::Shape convFilterShape = { 16, 3, 3, 3 };  // output channels, input channels, kernel height, kernel width
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesFP32);
+        } else {
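+            // bf16 weights are supplied as raw 16-bit patterns (hence std::vector<short>); BFloat16Helpers
+            // presumably produces them by truncating each fp32 value to the upper 16 bits of its bit pattern.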
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(16 * 3 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode = std::make_shared<ngraph::opset1::Convolution>(
+            addNode, weightsNode,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode->set_friendly_name("Convolution_1");
+
+        // TopK
+        const auto k = make_shared<op::Constant>(element::i32, Shape{}, vector<int>{1});
+        size_t axis = 1;
+        ngraph::op::v1::TopK::Mode mode = ngraph::op::v1::TopK::Mode::MAX;
+        ngraph::op::v1::TopK::SortType sort = ngraph::op::v1::TopK::SortType::NONE;
+        auto argmaxNode = std::make_shared<opset1::TopK>(convNode, k, axis, mode, sort);
+        argmaxNode->set_friendly_name("TopK_1");
+
+        auto goe0 = make_shared<op::GetOutputElement>(argmaxNode, 0);
+        auto goe1 = make_shared<op::GetOutputElement>(argmaxNode, 1);
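+        // GetOutputElement splits the two TopK outputs: output 0 (values) feeds the second convolution
+        // below, while output 1 (the I32 indices) is exposed directly as a network result.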
+
+        // convolution
+        std::shared_ptr<ngraph::opset1::Constant> weightsNode2 = nullptr;
+        ngraph::Shape convFilterShape2 = { 1, 1, 3, 3 };  // output channels, input channels, kernel height, kernel width
+        if (netPrecision == Precision::FP32) {
+            std::vector<float> weightValuesFP32;
+            weightValuesFP32.resize(1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesFP32.data(), weightValuesFP32.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesFP32);
+        } else {
+            std::vector<short> weightValuesBF16;
+            weightValuesBF16.resize(1 * 1 * 3 * 3);
+            BFloat16Helpers::fillInputsBySinValues(weightValuesBF16.data(), weightValuesBF16.size());
+            weightsNode2 = std::make_shared<ngraph::opset1::Constant>(ntype, convFilterShape2, weightValuesBF16.data());
+        }
+
+        std::shared_ptr<ngraph::Node> convNode2 = std::make_shared<ngraph::opset1::Convolution>(
+            goe0, weightsNode2,
+            ngraph::Strides({ 1, 1 }),   // strides
+            ngraph::CoordinateDiff({ 0, 0 }),  // pad begin
+            ngraph::CoordinateDiff({ 0, 0 }),   // pad end
+            ngraph::Strides({ 1, 1 }),        // dilation
+            ngraph::op::PadType::EXPLICIT);   // pad type
+        convNode2->set_friendly_name("Convolution_2");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{convNode2, goe1}, ngraph::ParameterVector{input1});
+    }
+    void SetUp() override {
+        std::tie(inputPrecision, netPrecision, inputShapes, newInputShapes, targetDevice) = this->GetParam();
+        fnPtr = createGraph(netPrecision);
+
+        threshold = 0.14f;  // max value in the last tensor for the FP32 network is 22.6
+
+        // STAGE2:
+        // fill the expected execution precision of each layer; it is defined by the precision of the input tensor to the primitive and is reflected in
+        // performance counters
+        expectedPrecisions["Add_4"] = "FP32";
+        expectedPrecisions["Convolution_1"] = "BF16";
+        expectedPrecisions["Convolution_2"] = "BF16";
+        expectedPrecisions["TopK_1"] = "FP32";
+    }
+};
+
+TEST_P(TopKInputsI32, CompareWithRefImpl) {
+    test();
+};
+
+
+INSTANTIATE_TEST_CASE_P(FP32_bfloat16_NoReshape, TopKInputsI32,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        TopKInputsI32::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(BF16_bfloat16_NoReshape, TopKInputsI32,
+                        ::testing::Combine(
+                            ::testing::Values(Precision::FP32),
+                            ::testing::Values(Precision::BF16),
+                            ::testing::Values(SizeVector({ 1, 3, 40, 40 })),
+                            ::testing::Values(SizeVector()),
+                            ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                        TopKInputsI32::getTestCaseName);
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reshape.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/reshape.cpp
new file mode 100644 (file)
index 0000000..d0976a7
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/reshape.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+// Common params
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+        InferenceEngine::Precision::FP32,
+        InferenceEngine::Precision::U8
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+        InferenceEngine::Precision::FP16
+};
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheckDynBatch, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(inputPrecisions),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+                ::testing::Values(std::vector<size_t>({30, 30, 30, 30})),
+                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                ::testing::Values(std::map<std::string, std::string>({{CONFIG_KEY(DYN_BATCH_ENABLED), CONFIG_VALUE(YES)}}))),
+                ReshapeLayerTest::getTestCaseName);
+
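+// With SpecialZero enabled, a 0 in the target shape keeps the corresponding input dimension, so the
+// {10, 0, 100} case below reshapes the 10x10x10x10 input to 10x10x100 (the element count is preserved).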
+INSTANTIATE_TEST_CASE_P(ReshapeCheck, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(inputPrecisions),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
+                ::testing::Values(std::vector<size_t>({10, 0, 100})),
+                ::testing::Values(CommonTestUtils::DEVICE_CPU),
+                ::testing::Values(std::map<std::string, std::string>({}))),
+                ReshapeLayerTest::getTestCaseName);
+}  // namespace
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/softmax.cpp b/inference-engine/tests/functional/plugin/cpu/shared_tests_instances/single_layer_tests/softmax.cpp
new file mode 100644 (file)
index 0000000..09ddf01
--- /dev/null
@@ -0,0 +1,51 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/softmax.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+};
+
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+    InferenceEngine::Precision::FP32,
+};
+
+const std::vector<InferenceEngine::Layout> inputLayouts2D = {
+    InferenceEngine::Layout::NC,
+};
+
+const std::vector<InferenceEngine::SizeVector> inputShapes2D = {
+    InferenceEngine::SizeVector {1, 100},
+};
+
+const std::vector<size_t> axis2D = {
+    1
+};
+
+const auto params2D = testing::Combine(
+    testing::ValuesIn(netPrecisions),
+    testing::ValuesIn(inputPrecisions),
+    testing::ValuesIn(inputLayouts2D),
+    testing::ValuesIn(inputShapes2D),
+    testing::ValuesIn(axis2D),
+    testing::Values(CommonTestUtils::DEVICE_CPU),
+    testing::Values(std::map<std::string, std::string>())
+);
+
+INSTANTIATE_TEST_CASE_P(
+    SoftMax2D,
+    SoftMaxLayerTest,
+    params2D,
+    SoftMaxLayerTest::getTestCaseName
+);
+
+}  // namespace
index fdd7696..efc4ef9 100644 (file)
@@ -6,13 +6,17 @@
 set(TARGET_NAME gpuFuncTests)
 
 addIeTargetTest(
-        NAME ${TARGET_NAME}
-        ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+        NAME
+            ${TARGET_NAME}
+        ROOT
+            ${CMAKE_CURRENT_SOURCE_DIR}
         DEPENDENCIES
             clDNNPlugin
         LINK_LIBRARIES
             funcSharedTests
+            ${CLDNN__IOCL_ICD_LIBPATH}
         ADD_CPPLINT
         LABELS
             GPU
-)
\ No newline at end of file
+)
+target_include_directories(${TARGET_NAME} PRIVATE ${CLDNN__IOCL_ICD_INCDIRS})
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp b/inference-engine/tests/functional/plugin/gpu/remote_blob_tests/cldnn_remote_blob_tests.cpp
new file mode 100644 (file)
index 0000000..aa139fc
--- /dev/null
@@ -0,0 +1,258 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <string>
+#include <utility>
+#include <vector>
+#include <memory>
+
+#include <cpp/ie_cnn_net_reader.h>
+#include <inference_engine.hpp>
+#include <ie_compound_blob.h>
+
+#include <cldnn/cldnn_config.hpp>
+
+#ifdef _WIN32
+# include <gpu/gpu_context_api_dx.hpp>
+#elif defined ENABLE_LIBVA
+# include <gpu/gpu_context_api_va.hpp>
+#endif
+#include <gpu/gpu_context_api_ocl.hpp>
+#include <common_test_utils/test_common.hpp>
+#include <functional_test_utils/plugin_cache.hpp>
+
+#include "ngraph_functions/subgraph_builders.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+
+using namespace ::testing;
+using namespace InferenceEngine;
+using namespace InferenceEngine::gpu;
+
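+// Minimal helper that owns an OpenCL context/device/queue: the default constructor picks the Intel
+// (vendor id 0x8086) GPU device, while the second constructor wraps a user-supplied cl_context handle.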
+struct OpenCL {
+    cl::Context _context;
+    cl::Device _device;
+    cl::CommandQueue _queue;
+
+    explicit OpenCL(std::shared_ptr<std::vector<cl_context_properties>> media_api_context_properties = nullptr) {
+        // get Intel iGPU OCL device, create context and queue
+        {
+            const unsigned int refVendorID = 0x8086;
+            cl_uint n = 0;
+            cl_int err = clGetPlatformIDs(0, NULL, &n);
+
+            // Get platform list
+            std::vector<cl_platform_id> platform_ids(n);
+            err = clGetPlatformIDs(n, platform_ids.data(), NULL);
+
+            for (auto& id : platform_ids) {
+                cl::Platform platform = cl::Platform(id);
+                std::vector<cl::Device> devices;
+                platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
+                for (auto& d : devices) {
+                    if (refVendorID == d.getInfo<CL_DEVICE_VENDOR_ID>()) {
+                        _device = d;
+                        _context = cl::Context(_device);
+                        break;
+                    }
+                }
+            }
+            cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+            _queue = cl::CommandQueue(_context, _device, props);
+        }
+    }
+
+    explicit OpenCL(cl_context context) {
+        // user-supplied context handle
+        _context = cl::Context(context, true);
+        _device = cl::Device(_context.getInfo<CL_CONTEXT_DEVICES>()[0].get(), true);
+
+        cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+        _queue = cl::CommandQueue(_context, _device, props);
+    }
+};
+
+class RemoteBlob_Test : public CommonTestUtils::TestsCommon {
+protected:
+    std::shared_ptr<ngraph::Function> fn_ptr;
+    virtual void SetUp() {
+        fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+    }
+};
+
+TEST_F(RemoteBlob_Test, canInputUserBlob) {
+#if defined(_WIN32) || defined(ANDROID)
+    GTEST_SKIP();
+#endif
+    CNNNetwork net(fn_ptr);
+
+    net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+    // TODO: investigate issue with IECore
+    auto ie = InferenceEngine::Core();
+    auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+    // regular inference
+    auto inf_req_regular = exec_net.CreateInferRequest();
+    InferenceEngine::Blob::Ptr fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+    inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+    inf_req_regular.Infer();
+    auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // inference using remote blob
+    auto inf_req_shared = exec_net.CreateInferRequest();
+    auto cldnn_context = exec_net.GetContext();
+    cl_context ctx = std::dynamic_pointer_cast<ClContext>(cldnn_context)->get();
+    auto ocl_instance = std::make_shared<OpenCL>(ctx);
+    cl_int err;
+
+    auto dims = net.getInputsInfo().begin()->second->getTensorDesc().getDims();
+    size_t imSize = dims[1] * dims[2] * dims[3];
+
+    cl::Buffer shared_buffer(ocl_instance->_context, CL_MEM_READ_WRITE, imSize, NULL, &err);
+    {
+        void* buffer = fakeImageData->buffer();
+        ocl_instance->_queue.enqueueWriteBuffer(shared_buffer, true, 0, imSize, buffer);
+    }
+
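+    // Wrap the user-allocated cl::Buffer as a remote blob tied to the network's cldnn context, so the
+    // plugin can presumably consume the device-side data directly instead of going through a host copy.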
+    Blob::Ptr shared_blob = make_shared_blob(net.getInputsInfo().begin()->second->getTensorDesc(), cldnn_context, shared_buffer);
+    inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, shared_blob);
+
+    inf_req_shared.Infer();
+    auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // compare results
+    {
+        ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+        ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+        auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+        FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+    }
+}
+
+TEST_F(RemoteBlob_Test, canInferOnUserContext) {
+#if defined _WIN32
+    GTEST_SKIP();
+#endif
+    auto fn_ptr = ngraph::builder::subgraph::makeSplitMultiConvConcat();
+    CNNNetwork net(fn_ptr);
+
+    net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+    net.getInputsInfo().begin()->second->setPrecision(Precision::U8);
+
+    auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+
+    auto ie = InferenceEngine::Core();
+    auto exec_net_regular = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU);
+
+    // regular inference
+    auto inf_req_regular = exec_net_regular.CreateInferRequest();
+    auto fakeImageData = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+    inf_req_regular.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+    inf_req_regular.Infer();
+    auto outputBlob_regular = inf_req_regular.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // inference using remote blob
+    auto ocl_instance = std::make_shared<OpenCL>();
+    auto remote_context = make_shared_context(ie, CommonTestUtils::DEVICE_GPU, ocl_instance->_context.get());
+    auto exec_net_shared = ie.LoadNetwork(net, remote_context);
+    auto inf_req_shared = exec_net_shared.CreateInferRequest();
+    inf_req_shared.SetBlob(net.getInputsInfo().begin()->first, fakeImageData);
+
+    inf_req_shared.Infer();
+    auto outputBlob_shared = inf_req_shared.GetBlob(net.getOutputsInfo().begin()->first);
+
+    // compare results
+    {
+        ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+        ASSERT_EQ(outputBlob_regular->size(), outputBlob_shared->size());
+        auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+        FuncTestUtils::compareBlobs(outputBlob_regular, outputBlob_shared, thr);
+    }
+}
+
+class TwoNets_Test : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<size_t> {
+    void SetUp() override {
+        num_streams = this->GetParam();
+        fn_ptrs = {ngraph::builder::subgraph::makeSplitMultiConvConcat(),
+                   ngraph::builder::subgraph::makeMultiSingleConv()};
+    };
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<std::size_t> & obj) {
+        return "num_streams_" + std::to_string(obj.param);
+    }
+protected:
+    size_t num_streams;
+    std::vector<std::shared_ptr<ngraph::Function>> fn_ptrs;
+};
+
+TEST_P(TwoNets_Test, canInferTwoExecNets) {
+    std::vector<InferenceEngine::CNNNetwork> nets;
+    for (auto &fn_ptr : fn_ptrs) {
+        nets.push_back(CNNNetwork(fn_ptr));
+    }
+
+    auto ie = InferenceEngine::Core();
+
+    std::vector<std::string> outputs;
+    std::vector<InferRequest> irs;
+    std::vector<std::shared_ptr<float*>> ref;
+    std::vector<int> outElementsCount;
+
+    for (size_t i = 0; i < nets.size(); ++i) {
+        auto net = nets[i];
+
+        net.getInputsInfo().begin()->second->setLayout(Layout::NCHW);
+        net.getInputsInfo().begin()->second->setPrecision(Precision::FP32);
+
+        auto exec_net = ie.LoadNetwork(net, CommonTestUtils::DEVICE_GPU,
+                               {{PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, std::to_string(num_streams)}});
+
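+        // Prepare one output name, infer request, input blob and reference result per stream, so that
+        // num_streams requests per network can later be run asynchronously in parallel.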
+        for (size_t j = 0; j < num_streams; j++) {
+            outputs.push_back(net.getOutputsInfo().begin()->first);
+
+            auto inf_req = exec_net.CreateInferRequest();
+            irs.push_back(inf_req);
+
+            auto blob = FuncTestUtils::createAndFillBlob(net.getInputsInfo().begin()->second->getTensorDesc());
+            inf_req.SetBlob(net.getInputsInfo().begin()->first, blob);
+
+            outElementsCount.push_back(std::accumulate(begin(fn_ptrs[i]->get_output_shape(0)), end(fn_ptrs[i]->get_output_shape(0)), 1,
+                                                       std::multiplies<size_t>()));
+
+            std::shared_ptr<float*> reOutData = ngraph::helpers::inferFnWithInterp<ngraph::element::Type_t::f32>(
+                    fn_ptrs[i], {inf_req.GetBlob(net.getInputsInfo().begin()->first)->buffer()}).front();
+            ref.push_back(reOutData);
+        }
+    }
+
+    const int niter = 10;
+    for (int i = 0; i < niter; i++) {
+        for (auto ir : irs) {
+            ir.StartAsync();
+        }
+
+        for (auto ir : irs) {
+            ir.Wait(IInferRequest::RESULT_READY);
+        }
+    }
+
+    for (auto& net : nets) {
+        ASSERT_EQ(net.getOutputsInfo().begin()->second->getPrecision(), InferenceEngine::Precision::FP32);
+    }
+    auto thr = FuncTestUtils::GetComparisonThreshold(InferenceEngine::Precision::FP32);
+    for (size_t i = 0; i < irs.size(); ++i) {
+        ASSERT_EQ(outElementsCount[i], irs[i].GetBlob(outputs[i])->size());
+        FuncTestUtils::compareRawBuffers(irs[i].GetBlob(outputs[i])->buffer().as<float*>(), *ref[i], outElementsCount[i],
+                                         outElementsCount[i],
+                                         thr);
+    }
+}
+
+const std::vector<size_t> num_streams{1, 2};
+
+INSTANTIATE_TEST_CASE_P(RemoteBlob, TwoNets_Test, ::testing::ValuesIn(num_streams), TwoNets_Test::getTestCaseName);
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reshape.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/reshape.cpp
new file mode 100644 (file)
index 0000000..6f858ac
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include "single_layer_tests/reshape.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+// Common params
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+            InferenceEngine::Precision::FP32,
+            InferenceEngine::Precision::U8
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+            InferenceEngine::Precision::FP32,
+            InferenceEngine::Precision::FP16
+};
+
+// TODO: Issue: 28981
+INSTANTIATE_TEST_CASE_P(DISABLED_ReshapeCheckDynBatch, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(inputPrecisions),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({1, 16, 16, 16})),
+                ::testing::Values(std::vector<size_t>({1, 0, 256})),
+                ::testing::Values(CommonTestUtils::DEVICE_GPU),
+                ::testing::Values(std::map<std::string, std::string>({{CONFIG_KEY(DYN_BATCH_ENABLED), CONFIG_VALUE(YES)}}))),
+                ReshapeLayerTest::getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(ReshapeCheck, ReshapeLayerTest,
+        ::testing::Combine(
+                ::testing::Values(true),
+                ::testing::ValuesIn(inputPrecisions),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(std::vector<size_t>({10, 10, 10, 10})),
+                ::testing::Values(std::vector<size_t>({10, 0, 100})),
+                ::testing::Values(CommonTestUtils::DEVICE_GPU),
+                ::testing::Values(std::map<std::string, std::string>({}))),
+                ReshapeLayerTest::getTestCaseName);
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/strided_slice.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/strided_slice.cpp
new file mode 100644 (file)
index 0000000..348bf9b
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+
+#include "single_layer_tests/strided_slice.hpp"
+#include "common_test_utils/test_constants.hpp"
+
+using namespace LayerTestsDefinitions;
+
+namespace {
+
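+// Field order mirrors stridedSliceParamsTuple (see single_layer_tests/strided_slice.hpp): input shape,
+// begin, end, strides, then begin/end/new-axis/shrink-axis/ellipsis masks, input precision, network
+// precision, and device name.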
+stridedSliceParamsTuple ss_only_test_cases[] = {
+        stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 0, 0, 0, 0 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+                       {1, 1, 1, 1}, {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+                       {0, 0, 0, 0}, {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 2, 2, 2, 2 }, { 1, 1, 1, 1 }, { 2, 2, 2, 2 }, { 1, 1, 1, 1 },
+                       {0, 0, 0, 0}, {0, 0, 0, 0},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 2, 2, 4, 3 }, { 0, 0, 0, 0 }, { 2, 2, 4, 3 }, { 1, 1, 2, 1 },
+                       {1, 1, 1, 1}, {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 2, 2, 4, 2 }, { 1, 0, 0, 1 }, { 2, 2, 4, 2 }, { 1, 1, 2, 1 },
+                       {0, 1, 1, 0}, {1, 1, 0, 0},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 1, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 },
+                       {1, 1, 1, 1}, {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+        stridedSliceParamsTuple({ 2, 2, 4, 2 }, { 1, 0, 0, 0 }, { 1, 2, 4, 2 }, { 1, 1, -2, -1 },
+                       {0, 1, 1, 1}, {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},  {1, 1, 1, 1},
+                                InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32,
+                                CommonTestUtils::DEVICE_GPU),
+};
+
+INSTANTIATE_TEST_CASE_P(
+        smoke_CLDNN, StridedSliceLayerTest, ::testing::ValuesIn(ss_only_test_cases),
+        StridedSliceLayerTest::getTestCaseName);
+
+
+}  // namespace
index 9b430ac..62daa07 100644 (file)
@@ -1,19 +1,23 @@
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2019-2020 Intel Corporation
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 
 set(TARGET_NAME myriadFuncTests)
 
+disable_deprecated_warnings()
+
 addIeTargetTest(
         NAME ${TARGET_NAME}
         ROOT ${CMAKE_CURRENT_SOURCE_DIR}
         DEPENDENCIES
             myriadPlugin
         LINK_LIBRARIES
+            vpu_common_lib
+            vpu_graph_transformer
             funcSharedTests
         ADD_CPPLINT
         LABELS
             VPU
             MYRIAD
-)
\ No newline at end of file
+)
diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/conversions/dynamic_shape_resolver.cpp b/inference-engine/tests/functional/plugin/myriad/ngraph/conversions/dynamic_shape_resolver.cpp
new file mode 100644 (file)
index 0000000..0bdf012
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+#include "ngraph/op/parameter.hpp"
+#include "ngraph/function.hpp"
+
+#include "cpp/ie_cnn_network.h"
+#include "ie_common.h"
+
+#include "common_test_utils/test_common.hpp"
+
+#include <gtest/gtest.h>
+
+namespace {
+
+class DynamicShapeResolverTests : public CommonTestUtils::TestsCommon {
+public:
+    void SetUp() override {
+        const auto tensorType  = ngraph::element::f16;
+        const auto shapeType   = ngraph::element::u64;
+        const auto tensorShape = std::initializer_list<std::size_t>{1, 800};
+
+        const auto tensor = std::make_shared<ngraph::op::Parameter>(tensorType, ngraph::Shape{tensorShape});
+        const auto shape  = std::make_shared<ngraph::op::Parameter>(shapeType, ngraph::Shape{tensorShape.size()});
+        auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(tensor, shape);
+        dynamicShapeResolver->set_friendly_name(s_FriendlyName);
+        const auto function = std::make_shared<ngraph::Function>(ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{tensor, shape});
+
+        cnnNetwork = InferenceEngine::CNNNetwork{function};
+        triggerConversionToCNNNetwork();
+    }
+
+protected:
+    InferenceEngine::CNNLayerPtr getDynamicShapeResolverLayer() const {
+        return cnnNetwork.getLayerByName(s_FriendlyName.c_str());
+    }
+    InferenceEngine::CNNNetwork cnnNetwork;
+
+private:
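+    // Iterating over the network via begin() is used here to force conversion of the ngraph::Function
+    // into the legacy CNNLayer-based representation, which is what the checks below inspect.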
+    void triggerConversionToCNNNetwork() {
+        cnnNetwork.begin();
+    }
+
+    static const std::string s_FriendlyName;
+};
+
+const std::string DynamicShapeResolverTests::s_FriendlyName = "DSR";
+
+TEST_F(DynamicShapeResolverTests, NGraphFunctionCanBeConvertedToCNNNetwork) {
+    ASSERT_EQ(cnnNetwork.getInputsInfo().size(), 2);
+    ASSERT_EQ(cnnNetwork.layerCount(), cnnNetwork.getInputsInfo().size() + 1);
+    ASSERT_EQ(cnnNetwork.getOutputsInfo().size(), 1);
+
+    const auto dynamicShapeResolver = getDynamicShapeResolverLayer();
+    ASSERT_EQ(dynamicShapeResolver->type, "DynamicShapeResolver");
+    ASSERT_EQ(dynamicShapeResolver->insData.size(), 2);
+    ASSERT_EQ(dynamicShapeResolver->outData.size(), 1);
+}
+
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/operations/dynamic_shape_resolver.cpp b/inference-engine/tests/functional/plugin/myriad/ngraph/operations/dynamic_shape_resolver.cpp
new file mode 100644 (file)
index 0000000..84d2112
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/function.hpp>
+
+#include <gtest/gtest.h>
+#include <common_test_utils/test_common.hpp>
+#include <details/ie_exception.hpp>
+
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+namespace {
+
+using DataType  = ngraph::element::Type_t;
+using DimsType  = ngraph::element::Type_t;
+using DataShape = ngraph::Shape;
+
+class DynamicShapeResolverTests : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<std::tuple<DataType, DimsType, DataShape>> {
+public:
+    void SetUp() override {
+        const auto& parameters = GetParam();
+        const auto& dataType   = std::get<0>(parameters);
+        const auto& dimsType   = std::get<1>(parameters);
+        const auto& dataShape  = std::get<2>(parameters);
+
+        data = std::make_shared<ngraph::op::Parameter>(dataType, dataShape);
+        dims = std::make_shared<ngraph::op::Parameter>(dimsType, ngraph::Shape{dataShape.size()});
+    }
+
+protected:
+    std::shared_ptr<ngraph::op::Parameter> data;
+    std::shared_ptr<ngraph::op::Parameter> dims;
+};
+
+TEST_P(DynamicShapeResolverTests, CanValidateAndInferTypes) {
+    std::shared_ptr<ngraph::op::DynamicShapeResolver> dynamicShapeResolver;
+    ASSERT_NO_THROW(dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims));
+    ASSERT_NO_THROW(std::make_shared<ngraph::Function>(ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{data, dims}));
+}
+
+std::set<ngraph::element::Type_t> allNGraphTypes() {
+    return {
+        ngraph::element::dynamic,
+        ngraph::element::boolean,
+        ngraph::element::bf16,
+        ngraph::element::f16,
+        ngraph::element::f32,
+        ngraph::element::f64,
+        ngraph::element::i8,
+        ngraph::element::i16,
+        ngraph::element::i32,
+        ngraph::element::i64,
+        ngraph::element::u1,
+        ngraph::element::u8,
+        ngraph::element::u16,
+        ngraph::element::u32,
+        ngraph::element::u64
+    };
+}
+
+std::set<ngraph::element::Type_t> allNGraphIntegralNumberTypes() {
+    return {
+        ngraph::element::i8,
+        ngraph::element::i16,
+        ngraph::element::i32,
+        ngraph::element::i64,
+        ngraph::element::u1,
+        ngraph::element::u8,
+        ngraph::element::u16,
+        ngraph::element::u32,
+        ngraph::element::u64
+    };
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverTests, testing::Combine(
+    testing::ValuesIn(allNGraphTypes()),
+    testing::ValuesIn(allNGraphIntegralNumberTypes()),
+    testing::Values(DataShape{1, 800}, DataShape{1, 1})));
+
+
+using DataPartialShape = ngraph::PartialShape;
+using DimsPartialShape = ngraph::PartialShape;
+class DynamicShapeResolverNegativeTests
+    : public CommonTestUtils::TestsCommon
+    , public testing::WithParamInterface<std::tuple<DataType, DimsType, DataPartialShape, DimsPartialShape>> {
+public:
+    void SetUp() override {
+        const auto& parameters = GetParam();
+        const auto& dataType   = std::get<0>(parameters);
+        const auto& dimsType   = std::get<1>(parameters);
+        const auto& dataPartialShape  = std::get<2>(parameters);
+        const auto& dimsPartialShape  = std::get<3>(parameters);
+
+        data = std::make_shared<ngraph::op::Parameter>(dataType, dataPartialShape);
+        dims = std::make_shared<ngraph::op::Parameter>(dimsType, dimsPartialShape);
+    }
+
+protected:
+    std::shared_ptr<ngraph::op::Parameter> data;
+    std::shared_ptr<ngraph::op::Parameter> dims;
+};
+
+class DynamicShapeResolverNegativeTestsDimsType : public DynamicShapeResolverNegativeTests {};
+TEST_P(DynamicShapeResolverNegativeTestsDimsType, ThrowsOnInvalidDimsType) {
+    ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+std::set<ngraph::element::Type_t> allNGraphNotIntegralTypes() {
+    auto notIntegralTypes = std::set<ngraph::element::Type_t>{};
+    const auto& allTypes = allNGraphTypes();
+    const auto& allIntegralTypes = allNGraphIntegralNumberTypes();
+    std::set_difference(allTypes.cbegin(), allTypes.cend(), allIntegralTypes.cbegin(), allIntegralTypes.cend(),
+        std::inserter(notIntegralTypes, notIntegralTypes.begin()));
+    return notIntegralTypes;
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDimsType, testing::Combine(
+    testing::ValuesIn(allNGraphTypes()),
+    testing::ValuesIn(allNGraphNotIntegralTypes()),
+    testing::Values(DataPartialShape{1, 800}),
+    testing::Values(DataPartialShape{2})));
+
+class DynamicShapeResolverNegativeTestsDataShape : public DynamicShapeResolverNegativeTests {};
+TEST_P(DynamicShapeResolverNegativeTestsDataShape, ThrowsOnInvalidDimsType) {
+    ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDataShape, testing::Combine(
+    testing::ValuesIn(allNGraphTypes()),
+    testing::ValuesIn(allNGraphIntegralNumberTypes()),
+    testing::Values(
+        DataPartialShape::dynamic(),
+        DataPartialShape{{1, ngraph::Dimension::dynamic()}},
+        DataPartialShape{{ngraph::Dimension::dynamic(), 1}},
+        DataPartialShape{{ngraph::Dimension::dynamic(), ngraph::Dimension::dynamic()}}),
+    testing::Values(DataShape{2})));
+
+class DynamicShapeResolverNegativeTestsDimsShape : public DynamicShapeResolverNegativeTests {};
+TEST_P(DynamicShapeResolverNegativeTestsDimsShape, ThrowsOnInvalidDimsType) {
+    ASSERT_THROW(std::make_shared<ngraph::op::DynamicShapeResolver>(data, dims), ngraph::ngraph_error);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicShapeResolverNegativeTestsDimsShape, testing::Combine(
+    testing::ValuesIn(allNGraphTypes()),
+    testing::ValuesIn(allNGraphIntegralNumberTypes()),
+    testing::Values(DataShape{1, 800}),
+    testing::Values(
+        DataPartialShape::dynamic(),
+        DataPartialShape{{1, ngraph::Dimension::dynamic()}},
+        DataPartialShape{{ngraph::Dimension::dynamic(), 1}},
+        DataPartialShape{{ngraph::Dimension::dynamic(), ngraph::Dimension::dynamic()}},
+        DataPartialShape{0},
+        DataPartialShape{1},
+        DataPartialShape{3})));
+
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/operations/static_shape_nonzero.cpp b/inference-engine/tests/functional/plugin/myriad/ngraph/operations/static_shape_nonzero.cpp
new file mode 100644 (file)
index 0000000..45ba484
--- /dev/null
@@ -0,0 +1,111 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+
+#include <common_test_utils/test_common.hpp>
+
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/function.hpp>
+
+#include <details/ie_exception.hpp>
+
+#include <gtest/gtest.h>
+
+namespace {
+
+using TensorType  = ngraph::element::Type;
+using TensorShape = ngraph::PartialShape;
+
+class StaticShapeNonZeroTests
+        : public CommonTestUtils::TestsCommon,
+          public testing::WithParamInterface<std::tuple<TensorType, TensorShape>> {
+public:
+    void SetUp() override {
+        const auto& parameters  = GetParam();
+        const auto& tensorType  = std::get<0>(parameters);
+        const auto& tensorShape = std::get<1>(parameters);
+
+        m_param = std::make_shared<ngraph::op::Parameter>(tensorType, tensorShape);
+    }
+protected:
+    std::shared_ptr<ngraph::op::Parameter> m_param;
+};
+
+std::vector<ngraph::PartialShape> testStaticShapes {
+        TensorShape{1000},
+        TensorShape{4, 1000},
+        TensorShape{3, 128, 256},
+        TensorShape{2, 3, 128, 256},
+};
+
+std::vector<ngraph::PartialShape> testDynamicShapes {
+        TensorShape{ngraph::Dimension::dynamic()},
+        TensorShape{4, ngraph::Dimension::dynamic()},
+        TensorShape{3, ngraph::Dimension::dynamic(), 256},
+};
+
+std::vector<ngraph::element::Type> testNGraphNumericTypes {
+        ngraph::element::dynamic,
+        ngraph::element::bf16,
+        ngraph::element::f16,
+        ngraph::element::f32,
+        ngraph::element::f64,
+        ngraph::element::i8,
+        ngraph::element::i16,
+        ngraph::element::i32,
+        ngraph::element::i64,
+        ngraph::element::u1,
+        ngraph::element::u8,
+        ngraph::element::u16,
+        ngraph::element::u32,
+        ngraph::element::u64,
+};
+
+//
+// Positive tests
+//
+
+TEST_P(StaticShapeNonZeroTests, CanValidateAndInferTypes) {
+    std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+    ASSERT_NO_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param));
+    ASSERT_NO_THROW(std::make_shared<ngraph::Function>(
+            ngraph::OutputVector{op->output(0), op->output(1)},
+            ngraph::ParameterVector{m_param}));
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTests, testing::Combine(
+        testing::ValuesIn(testNGraphNumericTypes),
+        testing::ValuesIn(testStaticShapes))
+);
+
+//
+// Negative tests
+//
+
+class StaticShapeNonZeroTestsNegativeDataType : public StaticShapeNonZeroTests {};
+TEST_P(StaticShapeNonZeroTestsNegativeDataType, ThrowsOnInvalidDataType) {
+    std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+    ASSERT_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param),
+                 ngraph::NodeValidationFailure);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTestsNegativeDataType, testing::Combine(
+        testing::Values(ngraph::element::boolean),
+        testing::ValuesIn(testStaticShapes))
+);
+
+class StaticShapeNonZeroTestsNegativeDataShape : public StaticShapeNonZeroTests {};
+TEST_P(StaticShapeNonZeroTestsNegativeDataShape, ThrowsOnInvalidDataShape) {
+    std::shared_ptr<ngraph::op::StaticShapeNonZero> op;
+    ASSERT_THROW(op = std::make_shared<ngraph::op::StaticShapeNonZero>(m_param),
+                 ngraph::NodeValidationFailure);
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, StaticShapeNonZeroTestsNegativeDataShape, testing::Combine(
+        testing::ValuesIn(testNGraphNumericTypes),
+        testing::ValuesIn(testDynamicShapes))
+);
+
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp b/inference-engine/tests/functional/plugin/myriad/ngraph/transformations/dynamic_to_static_shape_nonzero.cpp
new file mode 100644 (file)
index 0000000..51c090d
--- /dev/null
@@ -0,0 +1,110 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "vpu/ngraph/transformations/dynamic_to_static_shape_nonzero.hpp"
+#include "vpu/ngraph/operations/static_shape_nonzero.hpp"
+#include "vpu/ngraph/operations/dynamic_shape_resolver.hpp"
+
+#include "../utils/ngraph_utils.h"
+
+#include <ngraph/function.hpp>
+#include <ngraph/opsets/opset1.hpp>
+#include <ngraph/opsets/opset3.hpp>
+
+#include <cpp/ie_cnn_network.h>
+
+#include <common_test_utils/test_common.hpp>
+#include <gtest/gtest.h>
+
+#include <string>
+#include <memory>
+#include <map>
+#include <vector>
+
+namespace {
+
+using TensorType  = ngraph::element::Type_t;
+using TensorShape = ngraph::Shape;
+
+class DynamicToStaticShapeNonZeroTests
+        : public CommonTestUtils::TestsCommon,
+          public testing::WithParamInterface<std::tuple<TensorType, TensorShape>> {
+public:
+    void prepareFunctions() {
+        const auto& parameters = GetParam();
+        const auto& tensorType = std::get<0>(parameters);
+        const auto& tensorShape = std::get<1>(parameters);
+
+        // Create a function with only opset3::NonZero
+        // And then run conversion pass
+        {
+            const auto input = std::make_shared<ngraph::op::Parameter>(tensorType, tensorShape);
+
+            const auto nonZero = std::make_shared<ngraph::opset3::NonZero>(input);
+            nonZero->set_friendly_name(s_FriendlyName);
+
+            m_resfunction = std::make_shared<ngraph::Function>(
+                    ngraph::NodeVector{nonZero}, ngraph::ParameterVector{input});
+            ngraph::pass::DynamicToStaticShapeNonZero().run_on_function(m_resfunction);
+        }
+
+        // Create a reference function
+        {
+            const auto input = std::make_shared<ngraph::opset1::Parameter>(tensorType, tensorShape);
+
+            const auto staticShapeNonZero = std::make_shared<ngraph::op::StaticShapeNonZero>(input);
+            staticShapeNonZero->set_friendly_name(s_FriendlyName + "/static_shape");
+            const auto dynamicShapeResolver = std::make_shared<ngraph::op::DynamicShapeResolver>(
+                    staticShapeNonZero->output(0), staticShapeNonZero->output(1));
+            dynamicShapeResolver->set_friendly_name(s_FriendlyName + "/resolve_shape");
+
+            m_refFunction = std::make_shared<ngraph::Function>(
+                    ngraph::NodeVector{dynamicShapeResolver}, ngraph::ParameterVector{input});
+        }
+    }
+
+    void compareFunctions() {
+        FuncTestUtils::CompareFunctions(m_resfunction, m_refFunction);
+
+        auto actualResultNode = m_resfunction->get_output_op(0);
+        auto actualResolverNode = actualResultNode->input(0).get_source_output().get_node_shared_ptr();
+        auto actualNonZeroNode = actualResolverNode->input(0).get_source_output().get_node_shared_ptr();
+
+        auto expectedResultNode = m_refFunction->get_output_op(0);
+        auto expectedResolverNode = expectedResultNode->input(0).get_source_output().get_node_shared_ptr();
+        auto expectedNonZeroNode = expectedResolverNode->input(0).get_source_output().get_node_shared_ptr();
+
+        EXPECT_EQ(actualResolverNode->get_friendly_name(), expectedResolverNode->get_friendly_name());
+        EXPECT_EQ(actualNonZeroNode->get_friendly_name(), expectedNonZeroNode->get_friendly_name());
+    }
+
+protected:
+    std::shared_ptr<ngraph::Function> m_resfunction;
+    std::shared_ptr<ngraph::Function> m_refFunction;
+
+    static const std::string s_FriendlyName;
+};
+
+const std::string DynamicToStaticShapeNonZeroTests::s_FriendlyName = "non_zero";
+
+TEST_P(DynamicToStaticShapeNonZeroTests, inferAndValidate) {
+    prepareFunctions();
+    compareFunctions();
+}
+
+INSTANTIATE_TEST_CASE_P(NGraph, DynamicToStaticShapeNonZeroTests, testing::Combine(
+        testing::Values(
+                ngraph::element::f16,
+                ngraph::element::f32,
+                ngraph::element::i32,
+                ngraph::element::i64,
+                ngraph::element::u8),
+        testing::Values(
+                TensorShape{1000},
+                TensorShape{4, 1000},
+                TensorShape{3, 128, 256},
+                TensorShape{2, 3, 128, 256})
+));
+
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/myriad/ngraph/utils/ngraph_utils.h b/inference-engine/tests/functional/plugin/myriad/ngraph/utils/ngraph_utils.h
new file mode 100644 (file)
index 0000000..dd5cf87
--- /dev/null
@@ -0,0 +1,67 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/function.hpp>
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <queue>
+#include <string>
+
+namespace FuncTestUtils {
+
+using ComparingNodesPair = typename std::pair<std::shared_ptr<ngraph::Node>, std::shared_ptr<ngraph::Node>>;
+using ComparingNodesBFSQueue = typename std::queue<ComparingNodesPair>;
+
+//
+// This function compares two nGraph functions and requires them to have exactly one output
+// Check nodes types
+// Check number of inputs
+// Check shapes of each Node
+//
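+// Typical usage (as in the dynamic-to-static shape transformation tests in this patch):
+//     FuncTestUtils::CompareFunctions(actualFunction, referenceFunction);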
+void CompareFunctions(const std::shared_ptr<ngraph::Function>& fActual,
+                      const std::shared_ptr<ngraph::Function>& fExpected) {
+    const auto fActualResults = fActual->get_results();
+    const auto fExpectedResults = fExpected->get_results();
+
+    ASSERT_EQ(fActualResults.size(), 1);
+    ASSERT_EQ(fExpectedResults.size(), 1);
+
+    const auto typeInfoToStr = [](const ngraph::Node::type_info_t& typeInfo) {
+        return std::string(typeInfo.name) + "/" + std::to_string(typeInfo.version);
+    };
+
+    ComparingNodesBFSQueue comparingNodes;
+    comparingNodes.push({fActualResults[0], fExpectedResults[0]});
+    while (!comparingNodes.empty()) {
+        const auto node1 = comparingNodes.front().first;
+        const auto node2 = comparingNodes.front().second;
+        comparingNodes.pop();
+
+        ASSERT_EQ(node1->get_type_info(), node2->get_type_info())
+                                    << "Functions compare: data types must be equal "
+                                    << typeInfoToStr(node1->get_type_info()) << " != "
+                                    << typeInfoToStr(node2->get_type_info());
+
+        ASSERT_EQ(node1->inputs().size(), node2->inputs().size())
+                                    << "Functions compare: numbers of inputs are different: "
+                                    << node1->inputs().size() << " and " << node2->inputs().size();
+
+        for (size_t i = 0; i < node1->inputs().size(); ++i) {
+            const auto partialShape1 = node1->input(i).get_partial_shape();
+            const auto partialShape2 = node2->input(i).get_partial_shape();
+            ASSERT_TRUE(partialShape1.relaxes(partialShape2) && partialShape1.refines(partialShape2))
+                                        << "Functions compare: Different shape detected "
+                                        << partialShape1 << " and " << partialShape2;
+
+            comparingNodes.push({node1->input_value(i).get_node_shared_ptr(),
+                                 node2->input_value(i).get_node_shared_ptr()});
+        }
+    }
+}
+
+}  // namespace FuncTestUtils
diff --git a/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/nonzero.cpp b/inference-engine/tests/functional/plugin/myriad/shared_tests_instances/single_layer_tests/nonzero.cpp
new file mode 100644 (file)
index 0000000..57acc05
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/nonzero.hpp"
+
+#include "common_test_utils/test_constants.hpp"
+#include <vpu/vpu_plugin_config.hpp>
+#include <vpu/private_plugin_config.hpp>
+
+#include <vector>
+
+using namespace ngraph::helpers;
+using namespace LayerTestsDefinitions;
+
+namespace {
+
+std::vector<std::vector<size_t>> inShapes = {
+        {1000},
+        {4, 1000},
+        {2, 4, 1000},
+};
+
+const std::vector<InferenceEngine::Precision> inputPrecisions = {
+        InferenceEngine::Precision::I32,
+        InferenceEngine::Precision::FP16,
+        InferenceEngine::Precision::U8,
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP16
+};
+
+// Enable this when #29056 is ready
+INSTANTIATE_TEST_CASE_P(DISABLED_nonzero, NonZeroLayerTest,
+        ::testing::Combine(
+                ::testing::ValuesIn(inShapes),
+                ::testing::ValuesIn(inputPrecisions),
+                ::testing::ValuesIn(netPrecisions),
+                ::testing::Values(CommonTestUtils::DEVICE_MYRIAD),
+                ::testing::Values(ConfigMap({{VPU_CONFIG_KEY(DETECT_NETWORK_BATCH), CONFIG_VALUE(NO)}}))),
+         NonZeroLayerTest::getTestCaseName);
+}  // namespace
index 891bcb7..1af0d1d 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -26,6 +26,7 @@ public:
 
 protected:
     void SetUp() override;
+    void TearDown() override;
 };
 
 }  // namespace LayerTestsDefinitions
index 5d48007..296e35f 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -26,12 +26,11 @@ typedef std::tuple<
         InferenceEngine::Precision,
         InferenceEngine::Precision,
         InferenceEngine::SizeVector,
-        std::string> convLayerTestParamsSet;
+        LayerTestsUtils::TargetDevice> convLayerTestParamsSet;
 namespace LayerTestsDefinitions {
 
 
-class ConvolutionLayerTest
-        : public LayerTestsUtils::LayerTestsCommonClass<convLayerTestParamsSet> {
+class ConvolutionLayerTest : public testing::WithParamInterface<convLayerTestParamsSet>, public LayerTestsUtils::FuncTestsCommon {
 public:
     static std::string getTestCaseName(testing::TestParamInfo<convLayerTestParamsSet> obj);
 
@@ -39,4 +38,4 @@ protected:
     void SetUp() override;
 };
 
-}  // namespace LayerTestsDefinitions
\ No newline at end of file
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/nonzero.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/nonzero.hpp
new file mode 100644 (file)
index 0000000..399ff1a
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+using ConfigMap = typename std::map<std::string, std::string>;
+
+using NonZeroLayerTestParamsSet = typename std::tuple<
+        InferenceEngine::SizeVector,          // Input shapes
+        InferenceEngine::Precision,           // Input precision
+        InferenceEngine::Precision,           // Network precision
+        std::string,                          // Device name
+        ConfigMap>;                           // Config map
+
+class NonZeroLayerTest
+        : public LayerTestsUtils::LayerTestsCommonClass<NonZeroLayerTestParamsSet> {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<NonZeroLayerTestParamsSet> obj);
+
+protected:
+    void SetUp() override;
+};
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reshape.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/reshape.hpp
new file mode 100644 (file)
index 0000000..76c7365
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+    typedef std::tuple<
+            bool,                               // SpecialZero
+            InferenceEngine::Precision,         // Input precision
+            InferenceEngine::Precision,         // Network precision
+            std::vector<size_t>,                // Input shapes
+            std::vector<size_t>,                // OutForm Shapes
+            std::string,                        // Device name
+            std::map<std::string, std::string>  // Config
+            > reshapeParams;
+
+class ReshapeLayerTest
+        : public LayerTestsUtils::LayerTestsCommonClass<reshapeParams> {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<reshapeParams> obj);
+protected:
+    void SetUp() override;
+};
+
+}  // namespace LayerTestsDefinitions
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/softmax.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/softmax.hpp
new file mode 100644 (file)
index 0000000..93613fd
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+namespace LayerTestsDefinitions {
+
+using softMaxLayerTestParams =
+    std::tuple<
+        InferenceEngine::Precision,         // netPrecision
+        InferenceEngine::Precision,         // inputPrecision
+        InferenceEngine::Layout,            // inputLayout
+        InferenceEngine::SizeVector,        // inputShape
+        size_t,                             // axis
+        std::string,                        // targetDevice
+        std::map<std::string, std::string>  // config
+    >;
+
+class SoftMaxLayerTest :
+        public LayerTestsUtils::LayerTestsCommonClass<softMaxLayerTestParams> {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<softMaxLayerTestParams> obj);
+
+protected:
+    void SetUp() override;
+};
+
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/strided_slice.hpp b/inference-engine/tests/functional/plugin/shared/include/single_layer_tests/strided_slice.hpp
new file mode 100644 (file)
index 0000000..206ff44
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "functional_test_utils/layer_test_utils.hpp"
+
+namespace LayerTestsDefinitions {
+
+using stridedSliceParamsTuple = typename std::tuple<
+        InferenceEngine::SizeVector,       // Input shape
+        std::vector<int64_t>,              // Begin
+        std::vector<int64_t>,              // End
+        std::vector<int64_t>,              // Stride
+        std::vector<int64_t>,              // Begin mask
+        std::vector<int64_t>,              // End mask
+        std::vector<int64_t>,              // New axis mask
+        std::vector<int64_t>,              // Shrink axis mask
+        std::vector<int64_t>,              // Ellipsis axis mask
+        InferenceEngine::Precision,        // Input precision
+        InferenceEngine::Precision,        // Network precision
+        std::string>;                      // Device name
+
+class StridedSliceLayerTest : public LayerTestsUtils::LayerTestsCommonClass<stridedSliceParamsTuple> {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<stridedSliceParamsTuple> &obj);
+
+protected:
+    void SetUp() override;
+};
+}  // namespace LayerTestsDefinitions
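As a reading aid for the tuple layout above, one possible parameter set is sketched below (it assumes the header above is included); the values are illustrative only, and the mask comments follow the usual StridedSlice convention where a set bit typically means the corresponding begin/end bound is ignored.

// Illustrative stridedSliceParamsTuple value (placeholder data, not from this commit).
LayerTestsDefinitions::stridedSliceParamsTuple exampleParams{
        InferenceEngine::SizeVector{1, 12, 100},     // input shape
        std::vector<int64_t>{0, 1, 0},               // begin
        std::vector<int64_t>{1, 12, 100},            // end
        std::vector<int64_t>{1, 1, 1},               // stride
        std::vector<int64_t>{1, 0, 1},               // begin mask
        std::vector<int64_t>{1, 0, 1},               // end mask
        std::vector<int64_t>{0, 0, 0},               // new axis mask
        std::vector<int64_t>{0, 0, 0},               // shrink axis mask
        std::vector<int64_t>{0, 0, 0},               // ellipsis axis mask
        InferenceEngine::Precision::FP32,            // input precision
        InferenceEngine::Precision::FP32,            // network precision
        "CPU"};                                      // device name (placeholder)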
index 2d9602f..ac1c574 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -53,6 +53,12 @@ void ExecGraphUniqueNodeNames::SetUp() {
     fnPtr = std::make_shared<ngraph::Function>(results, params, "SplitConvConcat");
 }
 
+void ExecGraphUniqueNodeNames::TearDown() {
+    if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+        PluginCache::get().reset();
+    }
+}
+
 TEST_P(ExecGraphUniqueNodeNames, CheckUniqueNodeNames) {
     InferenceEngine::CNNNetwork cnnNet(fnPtr);
 
index 31a061f..9147242 100644 (file)
@@ -1,4 +1,5 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -33,9 +34,9 @@ TEST_P(PluginSpecificConversion, GeluConversionTest) {
         // Parameter->Activation->Output
         ASSERT_EQ(net.layerCount(), 3);
     } else if (device == "GPU") {
-        // Parameter--->ScaleShift-------------->Eltwise-->Result
+        // Parameter--->ScaleShift-------------->Eltwise
         //          `-->ScaleShift->ScaleShift-`
-        ASSERT_EQ(net.layerCount(), 6);
+        ASSERT_EQ(net.layerCount(), 5);
     }
 }
 
index 889472b..1adf2b8 100644 (file)
@@ -1,4 +1,5 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
+//
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -107,6 +108,9 @@ TEST_P(ActivationLayerTest, CompareWithRefs) {
                                      outElementsCount,
                                      thr);
     fnPtr.reset();
+    if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+        PluginCache::get().reset();
+    }
 }
 
 }  // namespace LayerTestsDefinitions
index da584ef..0782701 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -50,6 +50,8 @@ std::string ConvolutionLayerTest::getTestCaseName(testing::TestParamInfo<convLay
 void ConvolutionLayerTest::SetUp() {
     convSpecificParams convParams;
     std::vector<size_t> inputShape;
+    auto inputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+    auto netPrecision   = InferenceEngine::Precision::UNSPECIFIED;
     std::tie(convParams, inputPrecision, netPrecision, inputShape, targetDevice) = this->GetParam();
     ngraph::op::PadType padType;
     InferenceEngine::SizeVector kernel, stride, dilation;
@@ -64,10 +66,14 @@ void ConvolutionLayerTest::SetUp() {
             ngraph::builder::makeConvolution(paramOuts[0], ngPrc, kernel, stride, padBegin,
                                              padEnd, dilation, padType, convOutChannels));
     ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(conv)};
-    fnPtr = std::make_shared<ngraph::Function>(results, params, "convolution");
+    function = std::make_shared<ngraph::Function>(results, params, "convolution");
 }
 
 TEST_P(ConvolutionLayerTest, CompareWithRefs) {
-    inferAndValidate();
+    Run();
+
+    if (targetDevice == std::string{CommonTestUtils::DEVICE_GPU}) {
+        PluginCache::get().reset();
+    }
 }
-}  // namespace LayerTestsDefinitions
\ No newline at end of file
+}  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/nonzero.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/nonzero.cpp
new file mode 100644 (file)
index 0000000..5855ba6
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/nonzero.hpp"
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ie_core.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+std::string NonZeroLayerTest::getTestCaseName(testing::TestParamInfo<NonZeroLayerTestParamsSet> obj) {
+    std::vector<size_t> inputShape;
+    InferenceEngine::Precision inputPrecision, netPrecision;
+    std::string targetDevice;
+    ConfigMap config;
+    std::tie(inputShape, inputPrecision, netPrecision, targetDevice, config) = obj.param;
+
+    std::ostringstream result;
+    result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
+    result << "inPRC=" << inputPrecision.name() << "_";
+    result << "netPRC=" << netPrecision.name() << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void NonZeroLayerTest::SetUp() {
+    std::vector<size_t> inputShape;
+    std::tie(inputShape, inputPrecision, netPrecision, targetDevice, config) = this->GetParam();
+
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto paramNode = std::make_shared<ngraph::opset1::Parameter>(ngPrc, ngraph::Shape(inputShape));
+
+    auto nonZeroOp = std::make_shared<ngraph::opset3::NonZero>(paramNode->output(0));
+
+    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(nonZeroOp)};
+    fnPtr = std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{paramNode}, "non_zero");
+}
+
+TEST_P(NonZeroLayerTest, CompareWithRefs) {
+    inferAndValidate();
+}
+}  // namespace LayerTestsDefinitions
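For reference, the generic NonZero output layout that the test above validates, shown on assumed input data:

// Worked example of NonZero semantics (generic op behaviour, values assumed for illustration):
//   input  (2x2): [[0, 3],
//                  [5, 0]]
//   output shape  [rank, count] = [2, 2]:
//                 [[0, 1],   <- first-axis indices of the non-zero elements
//                  [1, 0]]   <- second-axis indices of the non-zero elements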
diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reshape.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/reshape.cpp
new file mode 100644 (file)
index 0000000..e425fda
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (C) 2019 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+#include <ie_plugin_config.hpp>
+#include <ie_core.hpp>
+#include <functional>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+#include "single_layer_tests/reshape.hpp"
+
+namespace LayerTestsDefinitions {
+std::string ReshapeLayerTest::getTestCaseName(testing::TestParamInfo<reshapeParams> obj) {
+    InferenceEngine::Precision inputPrecision, netPrecision;
+    InferenceEngine::SizeVector inputShapes, outFormShapes;
+    std::string targetDevice;
+    std::map<std::string, std::string> config;
+    bool specialZero;
+    std::tie(specialZero, inputPrecision, netPrecision, inputShapes, outFormShapes,
+            targetDevice, config) = obj.param;
+    std::ostringstream result;
+    result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
+    result << "specialZero=" << specialZero << "_";
+    result << "inPRC=" << inputPrecision.name() << "_";
+    result << "netPRC=" << netPrecision.name() << "_";
+    result << "targetDevice=" << targetDevice;
+    return result.str();
+}
+
+void ReshapeLayerTest::SetUp() {
+    InferenceEngine::SizeVector inputShapes, outFormShapes;
+    bool specialZero;
+    std::tie(specialZero, inputPrecision, netPrecision, inputShapes, outFormShapes,
+            targetDevice, config) = this->GetParam();
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto paramsIn = ngraph::builder::makeParams(ngPrc, {inputShapes});
+    auto paramIn = ngraph::helpers::convert2OutputVector(
+            ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(paramsIn));
+    auto constNode = std::make_shared<ngraph::opset1::Constant>(
+            ngraph::element::Type_t::i64, ngraph::Shape{outFormShapes.size()}, outFormShapes);
+    auto reshape = std::dynamic_pointer_cast<ngraph::opset1::Reshape>(
+            std::make_shared<ngraph::opset1::Reshape>(paramIn[0], constNode, specialZero));
+    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(reshape)};
+    fnPtr = std::make_shared<ngraph::Function>(results, paramsIn, "Reshape");
+}
+
+TEST_P(ReshapeLayerTest, CompareWithRefsDynamicBatch) {
+    inferAndValidate();
+}
+}  // namespace LayerTestsDefinitions
\ No newline at end of file
diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/softmax.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/softmax.cpp
new file mode 100644 (file)
index 0000000..430d5e7
--- /dev/null
@@ -0,0 +1,68 @@
+// Copyright (C) 2020 Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "single_layer_tests/softmax.hpp"
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/skip_tests_config.hpp"
+#include "functional_test_utils/layer_test_utils.hpp"
+
+#include "ie_core.hpp"
+
+#include "ngraph/op/softmax.hpp"
+
+#include <tuple>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace LayerTestsDefinitions {
+
+std::string SoftMaxLayerTest::getTestCaseName(testing::TestParamInfo<softMaxLayerTestParams> obj) {
+    InferenceEngine::Precision netPrecision, inputPrecision;
+    InferenceEngine::Layout inputLayout;
+    InferenceEngine::SizeVector inputShape;
+    size_t axis;
+    std::string targetDevice;
+    std::map<std::string, std::string> config;
+    std::tie(netPrecision, inputPrecision, inputLayout, inputShape, axis, targetDevice, config) = obj.param;
+
+    std::ostringstream result;
+    result << "netPRC=" << netPrecision.name() << "_";
+    result << "inPRC=" << inputPrecision.name() << "_";
+    result << "inLayout=" << inputLayout << "_";
+    result << "IS=" << CommonTestUtils::vec2str(inputShape) << "_";
+    result << "axis=" << axis << "_";
+    result << "targetDevice=" << targetDevice;
+
+    return result.str();
+}
+
+void SoftMaxLayerTest::SetUp() {
+    InferenceEngine::SizeVector inputShape;
+    size_t axis;
+    std::tie(netPrecision, inputPrecision, inputLayout, inputShape, axis, targetDevice, config) = GetParam();
+    outputPrecision = inputPrecision;
+    outputLayout = inputLayout;
+
+    const auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+    const auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+
+    const auto paramOuts =
+        ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+
+    const auto softMax = std::make_shared<ngraph::opset1::Softmax>(paramOuts.at(0), axis);
+
+    const ngraph::ResultVector results {std::make_shared<ngraph::opset1::Result>(softMax)};
+
+    fnPtr = std::make_shared<ngraph::Function>(results, params, "softMax");
+}
+
+TEST_P(SoftMaxLayerTest, CompareWithRefs) {
+    inferAndValidate();
+}
+
+}  // namespace LayerTestsDefinitions
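For the reference comparison above, recall what SoftMax computes along the chosen axis; a small worked example with assumed inputs:

// softmax([1, 2, 3]) along the only axis:
//   exp(1) = 2.7183, exp(2) = 7.3891, exp(3) = 20.0855, sum = 30.1929
//   result = [2.7183, 7.3891, 20.0855] / 30.1929 ≈ [0.0900, 0.2447, 0.6652]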
index f56f877..4fec56a 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -52,6 +52,6 @@ void SpaceToBatchLayerTest::SetUp() {
 
 TEST_P(SpaceToBatchLayerTest, CompareWithRefs) {
     inferAndValidate();
-};
+}
 
 }  // namespace LayerTestsDefinitions
diff --git a/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/strided_slice.cpp b/inference-engine/tests/functional/plugin/shared/src/single_layer_tests/strided_slice.cpp
new file mode 100644 (file)
index 0000000..7bc076b
--- /dev/null
@@ -0,0 +1,65 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include <ie_core.hpp>
+#include <ngraph_functions/builders.hpp>
+
+#include "functional_test_utils/blob_utils.hpp"
+#include "functional_test_utils/precision_utils.hpp"
+#include "common_test_utils/common_utils.hpp"
+
+#include "single_layer_tests/strided_slice.hpp"
+
+namespace LayerTestsDefinitions {
+
+std::string StridedSliceLayerTest::getTestCaseName(const testing::TestParamInfo<stridedSliceParamsTuple> &obj) {
+    InferenceEngine::SizeVector inputShape;
+    std::vector<int64_t> begin, end, stride;
+    std::vector<int64_t> begin_mask, new_axis_mask, end_mask, shrink_mask, ellipsis_mask;
+    InferenceEngine::Precision inPrc, netPrc;
+    std::string targetName;
+    std::tie(inputShape, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask, inPrc, netPrc, targetName) = obj.param;
+    std::ostringstream result;
+    result << "inShape=" << CommonTestUtils::vec2str(inputShape) << "_";
+    result << "inPRC=" << inPrc.name() << "_";
+    result << "netPRC=" << netPrc.name() << "_";
+    result << "begin=" << CommonTestUtils::vec2str(begin) << "_";
+    result << "end=" << CommonTestUtils::vec2str(end) << "_";
+    result << "stride=" << CommonTestUtils::vec2str(stride) << "_";
+    result << "begin_m=" << CommonTestUtils::vec2str(begin_mask) << "_";
+    result << "end_m=" << CommonTestUtils::vec2str(end_mask) << "_";
+    result << "new_axis_m=" << CommonTestUtils::vec2str(new_axis_mask) << "_";
+    result << "shrink_m=" << CommonTestUtils::vec2str(shrink_mask) << "_";
+    result << "ellipsis_m=" << CommonTestUtils::vec2str(ellipsis_mask) << "_";
+    result << "targetDevice=" << targetName << "_";
+    return result.str();
+}
+
+void StridedSliceLayerTest::SetUp() {
+    InferenceEngine::SizeVector inputShape;
+    std::vector<int64_t> begin, end, stride;
+    std::vector<int64_t> begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask;
+    std::tie(inputShape, begin, end, stride, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask,
+             inputPrecision, netPrecision, targetDevice) = this->GetParam();
+
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+    auto ss = ngraph::builder::makeStridedSlice(paramOuts[0], begin, end, stride, ngPrc, begin_mask, end_mask, new_axis_mask, shrink_mask, ellipsis_mask);
+    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(ss)};
+    fnPtr = std::make_shared<ngraph::Function>(results, params, "StridedSlice");
+}
+
+TEST_P(StridedSliceLayerTest, CompareWithRefs) {
+    inferAndValidate();
+}
+
+}  // namespace LayerTestsDefinitions
index 009403a..6dc7473 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 #include <map>
index e7ba52a..516765b 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 #include <algorithm>
index 55c67ae..b40b2d7 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 #include <cmath>
index 0d63d16..f2fa02a 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 #include <fstream>
index ac0d2d0..10f5182 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 82c7652..a8e7328 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 57158f7..5ad75af 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 namespace CommonTestUtils {
index 2817c74..280e328 100644 (file)
@@ -4,25 +4,18 @@
 
 set(TARGET_NAME funcTestUtils)
 
-list(APPEND EXPORT_DEPENDENCIES
-        commonTestUtils
-        inference_engine
-        )
+list(APPEND EXPORT_DEPENDENCIES commonTestUtils inference_engine)
 
 addIeTarget(
-        NAME ${TARGET_NAME}
-        TYPE STATIC
-        ROOT ${CMAKE_CURRENT_SOURCE_DIR}
-        ADD_CPPLINT
-        DEVELOPER_PACKAGE
-        EXPORT_DEPENDENCIES
-            ${EXPORT_DEPENDENCIES}
+    NAME ${TARGET_NAME}
+    TYPE STATIC
+    ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+    ADD_CPPLINT
+    DEVELOPER_PACKAGE
+    LINK_LIBRARIES ngraphFunctions
+    EXPORT_DEPENDENCIES ${EXPORT_DEPENDENCIES}
 )
 
-target_include_directories(${TARGET_NAME} PUBLIC
-            $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
+target_include_directories(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>)
 
-target_link_libraries(${TARGET_NAME}
-        PUBLIC
-        ${EXPORT_DEPENDENCIES}
-        )
\ No newline at end of file
+target_link_libraries(${TARGET_NAME} PUBLIC ${EXPORT_DEPENDENCIES})
index 4c0af8d..7cd467e 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #pragma once
 
 
@@ -12,6 +11,7 @@
 
 #include <gtest/gtest.h>
 #include "blob_factory.hpp"
+#include "blob_transform.hpp"
 #include "precision_utils.h"
 #include "common_test_utils/data_utils.hpp"
 #include "common_test_utils/test_constants.hpp"
@@ -240,18 +240,40 @@ InferenceEngine::Blob::Ptr inline createAndFillBlob(const InferenceEngine::Tenso
     blob->allocate();
     switch (td.getPrecision()) {
 #define CASE(X) case X: CommonTestUtils::fill_data_random<X>(blob, range, start_from, resolution); break;
-        CASE(InferenceEngine::Precision::FP32);
-        CASE(InferenceEngine::Precision::FP16);
-        CASE(InferenceEngine::Precision::U8);
-        CASE(InferenceEngine::Precision::U16);
-        CASE(InferenceEngine::Precision::I8);
-        CASE(InferenceEngine::Precision::I16);
-        CASE(InferenceEngine::Precision::I64);
-        CASE(InferenceEngine::Precision::BIN);
+        CASE(InferenceEngine::Precision::FP32)
+        CASE(InferenceEngine::Precision::FP16)
+        CASE(InferenceEngine::Precision::U8)
+        CASE(InferenceEngine::Precision::U16)
+        CASE(InferenceEngine::Precision::I8)
+        CASE(InferenceEngine::Precision::I16)
+        CASE(InferenceEngine::Precision::I64)
+        CASE(InferenceEngine::Precision::BIN)
+        CASE(InferenceEngine::Precision::I32)
 #undef CASE
         default:
             THROW_IE_EXCEPTION << "Wrong precision specified: " << td.getPrecision().name();
     }
     return blob;
 }
-}  // namespace FuncTestUtils
\ No newline at end of file
+
+InferenceEngine::Blob::Ptr inline convertBlobLayout(const InferenceEngine::Blob::Ptr& in,
+                                                    InferenceEngine::Layout layout) {
+    IE_ASSERT(in != nullptr) << "Got NULL pointer";
+
+    const auto& inDesc = in->getTensorDesc();
+
+    if (inDesc.getLayout() == layout) {
+        return in;
+    }
+
+    const auto outDesc = InferenceEngine::TensorDesc(inDesc.getPrecision(), inDesc.getDims(), layout);
+
+    const auto out = make_blob_with_precision(outDesc);
+    out->allocate();
+
+    InferenceEngine::blob_copy(in, out);
+
+    return out;
+}
+
+}  // namespace FuncTestUtils
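A minimal usage sketch of the new convertBlobLayout helper together with createAndFillBlob from the same header; the shape and layouts are assumptions for illustration only.

// Convert a randomly filled NCHW blob to NHWC (placeholder shape).
InferenceEngine::TensorDesc nchwDesc(InferenceEngine::Precision::FP32, {1, 3, 16, 16}, InferenceEngine::Layout::NCHW);
auto nchwBlob = FuncTestUtils::createAndFillBlob(nchwDesc);
auto nhwcBlob = FuncTestUtils::convertBlobLayout(nchwBlob, InferenceEngine::Layout::NHWC);
// nhwcBlob holds the same data reordered; if the requested layout already matches, the input blob is returned as-is.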
diff --git a/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp b/inference-engine/tests/ie_test_utils/functional_test_utils/layer_test_utils.cpp
new file mode 100644 (file)
index 0000000..3a6480f
--- /dev/null
@@ -0,0 +1,121 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "layer_test_utils.hpp"
+
+namespace LayerTestsUtils {
+
+FuncTestsCommon::FuncTestsCommon() {
+    core = PluginCache::get().ie(targetDevice).get();
+}
+
+void FuncTestsCommon::Run() {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+
+    Configure();
+    LoadNetwork();
+    Infer();
+    Validate();
+}
+
+FuncTestsCommon::~FuncTestsCommon() {
+    if (!configuration.empty()) {
+        PluginCache::get().reset();
+    }
+}
+
+InferenceEngine::Blob::Ptr FuncTestsCommon::GenerateInput(const InferenceEngine::InputInfo& info) const {
+    return FuncTestUtils::createAndFillBlob(info.getTensorDesc());
+}
+
+void FuncTestsCommon::Compare(const std::vector<std::uint8_t>& expected, const InferenceEngine::Blob::Ptr& actual) {
+    ASSERT_EQ(expected.size(), actual->byteSize());
+    const auto& expectedBuffer = expected.data();
+
+    auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(actual);
+    IE_ASSERT(memory);
+    const auto lockedMemory = memory->wmap();
+    const auto actualBuffer = lockedMemory.as<const std::uint8_t*>();
+
+    const auto& precision = actual->getTensorDesc().getPrecision();
+    const auto& size = actual->size();
+    switch (precision) {
+        case InferenceEngine::Precision::FP32:
+            Compare(reinterpret_cast<const float*>(expectedBuffer), reinterpret_cast<const float*>(actualBuffer), size, 1e-2f);
+            break;
+        case InferenceEngine::Precision::I32:
+            Compare(reinterpret_cast<const std::int32_t*>(expectedBuffer), reinterpret_cast<const std::int32_t*>(actualBuffer), size, 0);
+            break;
+        default:
+            FAIL() << "Comparator for " << precision << " precision isn't supported";
+    }
+}
+
+void FuncTestsCommon::Configure() const {
+    if (!configuration.empty()) {
+        core->SetConfig(configuration, targetDevice);
+    }
+}
+
+void FuncTestsCommon::LoadNetwork() {
+    cnnNetwork = InferenceEngine::CNNNetwork{function};
+    executableNetwork = core->LoadNetwork(cnnNetwork, targetDevice);
+    inferRequest = executableNetwork.CreateInferRequest();
+
+    for (const auto& input : cnnNetwork.getInputsInfo()) {
+        const auto& info = input.second;
+
+        auto blob = GenerateInput(*info);
+        inferRequest.SetBlob(info->name(), blob);
+        inputs.push_back(blob);
+    }
+}
+
+void FuncTestsCommon::Infer() {
+    inferRequest.Infer();
+}
+
+std::vector<InferenceEngine::Blob::Ptr> FuncTestsCommon::GetOutputs() {
+    auto outputs = std::vector<InferenceEngine::Blob::Ptr>{};
+    for (const auto& output : cnnNetwork.getOutputsInfo()) {
+        const auto& name = output.first;
+        outputs.push_back(inferRequest.GetBlob(name));
+    }
+    return outputs;
+}
+
+void FuncTestsCommon::Validate() {
+    // nGraph interpreter does not support f16
+    // IE converts f16 to f32
+    ngraph::pass::ConvertPrecision<ngraph::element::Type_t::f16, ngraph::element::Type_t::f32>().run_on_function(function);
+    function->validate_nodes_and_infer_types();
+
+    auto referenceInputs = std::vector<std::vector<std::uint8_t>>(inputs.size());
+    for (std::size_t i = 0; i < inputs.size(); ++i) {
+        const auto& input = inputs[i];
+        const auto& inputSize = input->byteSize();
+
+        auto& referenceInput = referenceInputs[i];
+        referenceInput.resize(inputSize);
+
+        auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(input);
+        IE_ASSERT(memory);
+        const auto lockedMemory = memory->wmap();
+        const auto buffer = lockedMemory.as<const std::uint8_t*>();
+        std::copy(buffer, buffer + inputSize, referenceInput.data());
+    }
+
+    const auto& expectedOutputs = ngraph::helpers::interpreterFunction(function, referenceInputs);
+    const auto& actualOutputs = GetOutputs();
+    IE_ASSERT(actualOutputs.size() == expectedOutputs.size())
+        << "nGraph interpreter has " << expectedOutputs.size() << " outputs, while IE " << actualOutputs.size();
+
+    for (std::size_t outputIndex = 0; outputIndex < expectedOutputs.size(); ++outputIndex) {
+        const auto& expected = expectedOutputs[outputIndex];
+        const auto& actual = actualOutputs[outputIndex];
+        Compare(expected, actual);
+    }
+}
+
+}  // namespace LayerTestsUtils
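To illustrate how the new FuncTestsCommon flow is meant to be used, a hypothetical derived test is sketched below (class name, shape and device are assumptions, not part of this commit): SetUp() fills the protected function and targetDevice members, and the test body only calls Run(), which performs Configure, LoadNetwork, Infer and Validate against the nGraph INTERPRETER reference.

// Hypothetical layer test built on FuncTestsCommon; all names and values are placeholders.
class ReluLayerSketchTest : public LayerTestsUtils::FuncTestsCommon,
                            public testing::WithParamInterface<std::string> {
protected:
    void SetUp() override {
        targetDevice = GetParam();  // e.g. "CPU" (placeholder)
        auto param = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{1, 3, 16, 16});
        auto relu = std::make_shared<ngraph::opset1::Relu>(param);
        ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(relu)};
        function = std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{param}, "relu_sketch");
    }
};

TEST_P(ReluLayerSketchTest, CompareWithRefs) {
    Run();  // Configure -> LoadNetwork -> Infer -> Validate
}

A matching INSTANTIATE_TEST_CASE_P would supply the device names for such a test; it is omitted here.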
index 70ed664..861b87c 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -12,7 +12,7 @@
 #include <gtest/gtest.h>
 #include <ngraph/node.hpp>
 #include <ngraph/function.hpp>
-
+#include <ie_plugin_config.hpp>
 #include <ngraph/function.hpp>
 
 #include "common_test_utils/common_utils.hpp"
@@ -38,20 +38,39 @@ typedef std::tuple<
 template<typename paramType>
 class LayerTestsCommonClass : public CommonTestUtils::TestsCommon, public testing::WithParamInterface<paramType> {
 public:
-    InferenceEngine::Precision netPrecision, inputPrecision;
+    InferenceEngine::Precision netPrecision;
+    InferenceEngine::Precision inputPrecision;
+    InferenceEngine::Precision outputPrecision;
+    InferenceEngine::Layout inputLayout;
+    InferenceEngine::Layout outputLayout;
     std::string targetDevice;
     std::shared_ptr<ngraph::Function> fnPtr;
+    std::map<std::string, std::string> config;
+
+    LayerTestsCommonClass() {
+        netPrecision = InferenceEngine::Precision::UNSPECIFIED;
+        inputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+        outputPrecision = InferenceEngine::Precision::UNSPECIFIED;
+        inputLayout = InferenceEngine::Layout::ANY;
+        outputLayout = InferenceEngine::Layout::ANY;
+    }
 
     void inline inferAndValidate() {
         // Skip test according to plugin specific disabledTestPatterns() (if any)
         SKIP_IF_CURRENT_TEST_IS_DISABLED()
         // Create CNNNetwork from ngrpah::Function
         InferenceEngine::CNNNetwork cnnNet(fnPtr);
-        // Set target input Precisions for the network
-        setNetInOutPrecision(cnnNet, inputPrecision);
+        // Set target input/output Precisions for the network
+        setNetInOutPrecision(cnnNet, inputPrecision, outputPrecision);
+        // Set target input Layouts for the network
+        setNetInOutLayout(cnnNet, inputLayout, outputLayout);
 
         // Get Core from cache
         auto ie = PluginCache::get().ie();
+        // Load config
+        if (!config.empty()) {
+            ie->SetConfig(config, targetDevice);
+        }
         // Load CNNNetwork to target plugins
         auto execNet = ie->LoadNetwork(cnnNet, targetDevice);
         // Create InferRequest
@@ -68,29 +87,54 @@ public:
         // Create input vector with raw data for reference calculation
         std::vector<const float *> inRawData;
         // References are calculated in float precision, so blobs have to be copied and casted if required
-        std::vector<InferenceEngine::Blob::Ptr> castedBlobs = inBlobs;
-        for (size_t i = 0; i < castedBlobs.size(); i++) {
-            if (inputPrecision != InferenceEngine::Precision::FP32) {
-                castedBlobs[i] = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(inBlobs[i]);
+        std::vector<InferenceEngine::Blob::Ptr> castedBlobs;
+        for (size_t i = 0; i < inBlobs.size(); i++) {
+            const auto precision = inBlobs[i]->getTensorDesc().getPrecision();
+            const auto layout = inBlobs[i]->getTensorDesc().getLayout();
+            const auto defLayout = InferenceEngine::TensorDesc::getLayoutByDims(inBlobs[i]->getTensorDesc().getDims());
+
+            if (precision == InferenceEngine::Precision::FP32 && layout == defLayout) {
+                inRawData.push_back(inBlobs[i]->cbuffer().template as<const float*>());
+            } else {
+                auto castedBlob = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(inBlobs[i]);
+                castedBlob = FuncTestUtils::convertBlobLayout(castedBlob, defLayout);
+                inRawData.push_back(castedBlob->cbuffer().template as<const float*>());
+                castedBlobs.push_back(castedBlob);
             }
-            inRawData.push_back(castedBlobs[i]->cbuffer().as<float *>());
         }
         // Run inference in IE
         req.Infer();
-
+        // Reset PluginCache
+        if (!config.empty()) {
+            PluginCache::get().reset();
+        }
         // Get output raw data from resulting output blobs
         std::vector<float *> outBlobsRawData;
         std::vector<size_t> outElementsCount;  // output elements count required for compareRawBuffers()
         for (const auto &output : cnnNet.getOutputsInfo()) {
             auto currentBlob = req.GetBlob(output.first);
-            outBlobsRawData.push_back(currentBlob->cbuffer().template as<float *>());
+
             outElementsCount.push_back(
-                    std::accumulate(begin(output.second->getDims()), end(output.second->getDims()), 1,
-                                    std::multiplies<float>()));
+                std::accumulate(
+                    std::begin(output.second->getDims()), std::end(output.second->getDims()),
+                    size_t {1}, std::multiplies<size_t>()));
+
+            const auto precision = currentBlob->getTensorDesc().getPrecision();
+            const auto layout = currentBlob->getTensorDesc().getLayout();
+            const auto defLayout = InferenceEngine::TensorDesc::getLayoutByDims(currentBlob->getTensorDesc().getDims());
+
+            if (precision == InferenceEngine::Precision::FP32 && layout == defLayout) {
+                outBlobsRawData.push_back(currentBlob->cbuffer().template as<float*>());
+            } else {
+                auto castedBlob = FuncTestUtils::copyBlobWithCast<InferenceEngine::Precision::FP32>(currentBlob);
+                castedBlob = FuncTestUtils::convertBlobLayout(castedBlob, defLayout);
+                outBlobsRawData.push_back(castedBlob->cbuffer().template as<float*>());
+                castedBlobs.push_back(castedBlob);
+            }
         }
 
         // Convert initial ngraph::Function to fp32 for references calculation
-        convertFuncToF32(fnPtr, netPrecision);;
+        convertFuncToF32(fnPtr, netPrecision);
         // Run ngraph Interpreter backend to calculate references
         auto refOutData = ngraph::helpers::inferFnWithInterp<ngraph::element::Type_t::f32>(fnPtr, inRawData);
         // Compare IE infer results vs ngraph Interpreter reference results
@@ -99,13 +143,18 @@ public:
 
         // Deallocate ngraph::Function pointer
         fnPtr.reset();
+        if (targetDevice.find(CommonTestUtils::DEVICE_GPU) != std::string::npos) {
+            PluginCache::get().reset();
+        }
     }
 
 protected:
-    void setNetInOutPrecision(InferenceEngine::CNNNetwork &cnnNet, InferenceEngine::Precision inPrc,
+    static void setNetInOutPrecision(InferenceEngine::CNNNetwork &cnnNet, InferenceEngine::Precision inPrc,
                               InferenceEngine::Precision outPrc = InferenceEngine::Precision::UNSPECIFIED) {
-        for (const auto &inputItem : cnnNet.getInputsInfo()) {
-            inputItem.second->setPrecision(inPrc);
+        if (inPrc != InferenceEngine::Precision::UNSPECIFIED) {
+            for (const auto &inputItem : cnnNet.getInputsInfo()) {
+                inputItem.second->setPrecision(inPrc);
+            }
         }
         if (outPrc != InferenceEngine::Precision::UNSPECIFIED) {
             for (const auto &output : cnnNet.getOutputsInfo()) {
@@ -114,6 +163,20 @@ protected:
         }
     }
 
+    static void setNetInOutLayout(InferenceEngine::CNNNetwork& cnnNet, InferenceEngine::Layout inputLayout,
+                                  InferenceEngine::Layout outputLayout = InferenceEngine::Layout::ANY) {
+        if (inputLayout != InferenceEngine::Layout::ANY) {
+            for (const auto& inputItem : cnnNet.getInputsInfo()) {
+                inputItem.second->setLayout(inputLayout);
+            }
+        }
+        if (outputLayout != InferenceEngine::Layout::ANY) {
+            for (const auto& output : cnnNet.getOutputsInfo()) {
+                output.second->setLayout(outputLayout);
+            }
+        }
+    }
+
     void convertFuncToF32(std::shared_ptr<ngraph::Function> fn, InferenceEngine::Precision prc) {
         switch (prc) {
             case InferenceEngine::Precision::FP32:
@@ -154,4 +217,52 @@ inline std::vector<std::shared_ptr<ngraph::Node>> findTargetNodes(std::shared_pt
     return nodes;
 }
 
+using TargetDevice = std::string;
+
+class FuncTestsCommon : public CommonTestUtils::TestsCommon {
+public:
+    virtual InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const;
+    virtual void Run();
+    virtual void Compare(const std::vector<std::uint8_t>& expected, const InferenceEngine::Blob::Ptr& actual);
+
+protected:
+     FuncTestsCommon();
+    ~FuncTestsCommon() override;
+
+    template<class T>
+    void Compare(const T* expected, const T* actual, std::size_t size, T threshold) {
+        for (std::size_t i = 0; i < size; ++i) {
+            const auto& ref = expected[i];
+            const auto& res = actual[i];
+
+            const auto absoluteDifference = std::abs(res - ref);
+            if (absoluteDifference <= threshold) {
+                continue;
+            }
+
+            const auto max = std::max(std::abs(res), std::abs(ref));
+            ASSERT_TRUE(max != 0 && ((absoluteDifference / max) <= threshold))
+                << "Relative comparison of values expected: " << ref << " and actual: " << res << " at index " << i << " with threshold " << threshold
+                << " failed";
+        }
+    }
+
+    TargetDevice targetDevice;
+    std::shared_ptr<ngraph::Function> function;
+    std::map<std::string, std::string> configuration;
+
+private:
+    void Configure() const;
+    void LoadNetwork();
+    void Infer();
+    std::vector<InferenceEngine::Blob::Ptr> GetOutputs();
+    void Validate();
+
+    InferenceEngine::Core* core = nullptr;
+    InferenceEngine::CNNNetwork cnnNetwork;
+    InferenceEngine::ExecutableNetwork executableNetwork;
+    InferenceEngine::InferRequest inferRequest;
+    std::vector<InferenceEngine::Blob::Ptr> inputs;
+};
+
 }  // namespace LayerTestsUtils
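The templated Compare introduced above accepts a value when the absolute difference is within the threshold and otherwise falls back to a relative check; a worked example with assumed numbers:

// expected = 100.0f, actual = 100.5f, threshold = 1e-2f
// absolute difference = 0.5 > 0.01, so the relative branch runs:
// max(|actual|, |expected|) = 100.5, and 0.5 / 100.5 ≈ 0.00498 <= 0.01 -> the value is accepted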
index bf9b10f..5698f46 100644 (file)
@@ -9,6 +9,7 @@ list(APPEND EXPORT_DEPENDENCIES
         inference_engine_s
         inference_engine_preproc_s
         inference_engine_lp_transformations
+        inference_engine_ir_readers
         gmock)
 
 addIeTarget(
index 12129e2..05e86e8 100644 (file)
@@ -63,6 +63,9 @@ public:
         inputs[MockNotEmptyICNNNetwork::INPUT_BLOB_NAME] = inputInfo;
     };
     void addLayer(const CNNLayerPtr& layer) noexcept override {}
+    std::shared_ptr<ngraph::Function> getFunction() noexcept override {
+        return nullptr;
+    }
     std::shared_ptr<const ngraph::Function> getFunction() const noexcept override {
         return nullptr;
     }
index bdeb1a8..c9c73e2 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -9,6 +9,7 @@
 
 #include <ngraph/opsets/opset1.hpp>
 #include <ngraph/opsets/opset2.hpp>
+#include <ngraph/opsets/opset3.hpp>
 
 #include "ngraph_functions/utils/data_utils.hpp"
 
@@ -53,5 +54,16 @@ std::shared_ptr<ngraph::Node> makeSpaceToBatch(const ngraph::Output<Node> &in,
                                                const std::vector<size_t> &blockShape,
                                                const std::vector<size_t> &padsBegin,
                                                const std::vector<size_t> &padsEnd);
+
+std::shared_ptr<ngraph::Node> makeStridedSlice(const ngraph::Output<Node> &in,
+                                               const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &end,
+                                               const std::vector<int64_t> &stride,
+                                               const element::Type &type,
+                                               const std::vector<int64_t> &begin_mask,
+                                               const std::vector<int64_t> &end_mask,
+                                               const std::vector<int64_t> &new_axis_mask = std::vector<int64_t>{},
+                                               const std::vector<int64_t> &shrink_mask = std::vector<int64_t>{},
+                                               const std::vector<int64_t> &ellipsis_mask = std::vector<int64_t>{});
 }  // namespace builder
-}  // namespace ngraph
\ No newline at end of file
+}  // namespace ngraph
index c5d57cc..879c35b 100644 (file)
@@ -1,6 +1,18 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
+//*****************************************************************************
+// Copyright 2017-2020 Intel Corporation
 //
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
 
 #pragma once
 
diff --git a/inference-engine/tests/ngraph_functions/include/ngraph_functions/subgraph_builders.hpp b/inference-engine/tests/ngraph_functions/include/ngraph_functions/subgraph_builders.hpp
new file mode 100644 (file)
index 0000000..2032a66
--- /dev/null
@@ -0,0 +1,188 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ie_precision.hpp>
+#include <functional_test_utils/precision_utils.hpp>
+#include "ngraph_functions/builders.hpp"
+
+namespace ngraph {
+namespace builder {
+namespace subgraph {
+static std::shared_ptr<ngraph::Function> makeSplitConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20},
+                                                            InferenceEngine::Precision netPrecision = InferenceEngine::Precision::FP32) {
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
+
+    auto conv1 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1 = std::make_shared<ngraph::opset1::Relu>(conv1);
+
+    auto conv2 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2 = std::make_shared<ngraph::opset1::Relu>(conv2);
+
+    auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{relu1->output(0), relu2->output(0)}, 1);
+    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(concat)};
+    std::shared_ptr<ngraph::Function> fnPtr = std::make_shared<ngraph::Function>(results, params);
+    return fnPtr;
+}
+
+static std::shared_ptr<ngraph::Function> makeSplitMultiConvConcat(std::vector<size_t> inputShape = {1, 4, 20, 20}) {
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(InferenceEngine::Precision::FP32);
+    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto split = ngraph::builder::makeSplit(params[0], ngPrc, 2, 1);
+
+    auto conv1_0 = ngraph::builder::makeConvolution(split->output(0), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1_0 = std::make_shared<ngraph::opset1::Relu>(conv1_0);
+    auto conv1_1 = ngraph::builder::makeConvolution(relu1_0, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1_1 = std::make_shared<ngraph::opset1::Relu>(conv1_1);
+    auto conv1_2 = ngraph::builder::makeConvolution(relu1_1, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1_2 = std::make_shared<ngraph::opset1::Relu>(conv1_2);
+    auto conv1_3 = ngraph::builder::makeConvolution(relu1_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1_3 = std::make_shared<ngraph::opset1::Relu>(conv1_3);
+    auto conv1_4 = ngraph::builder::makeConvolution(relu1_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu1_4 = std::make_shared<ngraph::opset1::Relu>(conv1_4);
+
+    auto conv2_0 = ngraph::builder::makeConvolution(split->output(1), ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2_0 = std::make_shared<ngraph::opset1::Relu>(conv2_0);
+    auto conv2_1 = ngraph::builder::makeConvolution(relu2_0, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2_1 = std::make_shared<ngraph::opset1::Relu>(conv2_1);
+    auto conv2_2 = ngraph::builder::makeConvolution(relu2_1, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2_2 = std::make_shared<ngraph::opset1::Relu>(conv2_2);
+    auto conv2_3 = ngraph::builder::makeConvolution(relu2_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2_3 = std::make_shared<ngraph::opset1::Relu>(conv2_3);
+    auto conv2_4 = ngraph::builder::makeConvolution(relu2_2, ngPrc, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                    ngraph::op::PadType::EXPLICIT, 5);
+    auto relu2_4 = std::make_shared<ngraph::opset1::Relu>(conv2_4);
+
+    auto concat = std::make_shared<ngraph::opset1::Concat>(ngraph::OutputVector{relu1_4->output(0), relu2_4->output(0)}, 1);
+    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(concat)};
+    std::shared_ptr<ngraph::Function> fnPtr = std::make_shared<ngraph::Function>(results, params);
+    return fnPtr;
+}
+
+static std::shared_ptr<ngraph::Function>
+makeTIwithLSTMcell(InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+    auto ngPRC = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+    // That which we iterate over
+    const size_t N = 32; // Batch size
+    const size_t L = 10; // Sequence length
+    const size_t I = 8;  // Input size
+    const size_t H = 32; // Hidden size
+    auto SENT = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, L, I});
+
+    auto H_init = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+    auto C_init = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+
+    auto H_t = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+    auto C_t = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, H});
+
+    // Body
+    auto X = std::make_shared<ngraph::opset1::Parameter>(ngPRC, ngraph::Shape{N, 1, I});
+    std::vector<uint64_t> dataW(4 * H * I, 0);
+    auto W_body = std::make_shared<ngraph::opset1::Constant>(ngPRC, ngraph::Shape{4 * H, I}, dataW);
+    std::vector<uint64_t> dataR(4 * H * H, 0);
+    auto R_body = std::make_shared<ngraph::opset1::Constant>(ngPRC, ngraph::Shape{4 * H, H}, dataR);
+    std::vector<uint64_t> inShape = {N, H};
+    auto constantH = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{2}, inShape);
+    inShape = {N, I};
+    auto constantX = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, ngraph::Shape{2}, inShape);
+    auto LSTM_cell =
+            std::make_shared<ngraph::opset1::LSTMCell>(std::make_shared<ngraph::opset1::Reshape>(X, constantX, false),
+                                                   std::make_shared<ngraph::opset1::Reshape>(H_t, constantH, false),
+                                                   std::make_shared<ngraph::opset1::Reshape>(C_t, constantH, false),
+                                                   W_body,
+                                                   R_body,
+                                                   H);
+    inShape = {N, 1, H};
+    auto constantHo = std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{3}, inShape);
+    auto H_o = std::make_shared<ngraph::opset1::Reshape>(LSTM_cell->output(0), constantHo, false);
+    auto C_o = std::make_shared<ngraph::opset1::Reshape>(LSTM_cell->output(1), constantHo, false);
+    auto body = std::make_shared<ngraph::op::TensorIterator::BodyLambda>(
+            ngraph::OutputVector{H_o, C_o}, ngraph::ParameterVector{X, H_t, C_t});
+
+    auto tensor_iterator = std::make_shared<ngraph::op::TensorIterator>();
+    tensor_iterator->set_body(body);
+    // start=0, stride=1, part_size=1, end=-1 (last), axis=1
+    tensor_iterator->set_sliced_input(X, SENT, 0, 1, 1, -1, 1);
+    // H_t is Hinit on the first iteration, Ho after that
+    tensor_iterator->set_merged_input(H_t, H_init, H_o);
+    tensor_iterator->set_merged_input(C_t, C_init, C_o);
+
+    // Output 0 is last Ho, result 0 of body
+    auto out0 = tensor_iterator->get_iter_value(H_o, -1);
+    // Output 1 is last Co, result 1 of body
+    auto out1 = tensor_iterator->get_iter_value(C_o, -1);
+
+    auto results = ngraph::ResultVector{std::make_shared<ngraph::opset1::Result>(out0),
+                                        std::make_shared<ngraph::opset1::Result>(out1)};
+    auto fn_ptr = std::make_shared<ngraph::Function>(results, ngraph::ParameterVector{SENT, H_init, C_init});
+    return fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> makeSingleConv(std::vector<size_t> inputShape = {1, 3, 24, 24},
+                                                        InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+    ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+    auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+    auto conv1 = ngraph::builder::makeConvolution(param0, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto result = std::make_shared<ngraph::opset1::Result>(conv1);
+    auto fn_ptr = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0});
+    return
+            fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> makeMultiSingleConv(std::vector<size_t> inputShape = {1, 3, 24, 24}) {
+    ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(InferenceEngine::Precision::FP32);
+    auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+    auto conv1 = ngraph::builder::makeConvolution(param0, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv2 = ngraph::builder::makeConvolution(conv1, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv3 = ngraph::builder::makeConvolution(conv2, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv4 = ngraph::builder::makeConvolution(conv3, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv5 = ngraph::builder::makeConvolution(conv4, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv6 = ngraph::builder::makeConvolution(conv5, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv7 = ngraph::builder::makeConvolution(conv6, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv8 = ngraph::builder::makeConvolution(conv7, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                 ngraph::op::PadType::EXPLICIT, 5);
+    auto conv9 = ngraph::builder::makeConvolution(conv8, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto conv10 = ngraph::builder::makeConvolution(conv9, type, {3, 3}, {1, 1}, {0, 0}, {0, 0}, {1, 1},
+                                                  ngraph::op::PadType::EXPLICIT, 5);
+    auto result = std::make_shared<ngraph::opset1::Result>(conv10);
+    auto fn_ptr = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0});
+    return
+            fn_ptr;
+}
+
+static std::shared_ptr<ngraph::Function> make2InputSubtract(std::vector<size_t> inputShape = {1, 3, 24, 24},
+                                                            InferenceEngine::Precision prc = InferenceEngine::Precision::FP32) {
+    ngraph::element::Type type = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(prc);
+    auto param0 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+    auto param1 = std::make_shared<ngraph::opset1::Parameter>(type, ngraph::Shape(inputShape));
+    auto subtract = std::make_shared<ngraph::opset1::Subtract>(param0, param1);
+    auto result = std::make_shared<ngraph::opset1::Result>(subtract);
+    return std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{param0, param1});
+}
+}  // namespace subgraph
+}  // namespace builder
+}  // namespace ngraph
\ No newline at end of file
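A minimal sketch of how these subgraph builders can be consumed outside the test fixtures; the device name is a placeholder assumption and the snippet is not part of this commit.

// Build a ready-made Split->Conv->Concat function and run it through the Inference Engine.
#include <ie_core.hpp>
#include "ngraph_functions/subgraph_builders.hpp"

int main() {
    auto fn = ngraph::builder::subgraph::makeSplitConvConcat();   // default shape {1, 4, 20, 20}, FP32
    InferenceEngine::CNNNetwork network(fn);
    InferenceEngine::Core core;
    auto execNet = core.LoadNetwork(network, "CPU");              // device name is a placeholder
    auto request = execNet.CreateInferRequest();
    request.Infer();                                              // runs on default-initialized input blobs
    return 0;
}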
index ea8aff8..6fbdc07 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #include <vector>
 #include <memory>
 
+#include <ngraph/runtime/interpreter/int_backend_visibility.hpp>
 #include <ngraph/opsets/opset1.hpp>
 #include <ngraph/runtime/backend_manager.hpp>
-#include <ngraph/component_manager.hpp>
 #include <ngraph/runtime/backend.hpp>
 #include <ngraph/runtime/tensor.hpp>
 
+extern "C" INTERPRETER_BACKEND_API void ngraph_register_interpreter_backend();
+
 namespace ngraph {
 namespace helpers {
 
@@ -111,6 +113,7 @@ inferFnWithInterp(const std::shared_ptr<ngraph::Function> &fn,
     ngraph::runtime::Backend::set_backend_shared_library_search_directory("");
 
     ngraph_register_interpreter_backend();
+
     auto backend = ngraph::runtime::Backend::create("INTERPRETER");
 
     std::vector<std::shared_ptr<ngraph::runtime::Tensor>> inTensors;
@@ -138,5 +141,8 @@ inferFnWithInterp(const std::shared_ptr<ngraph::Function> &fn,
     }
     return outData;
 }
+
+std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function>& function, const std::vector<std::vector<std::uint8_t>>& inputs);
+
 }  // namespace helpers
 }  // namespace ngraph
index aa692d1..9745164 100644 (file)
@@ -1,7 +1,8 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
+
 #include <vector>
 #include <memory>
 
index 135f097..6ce6860 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 5c5b47c..5600015 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 77094cb..34abb81 100644 (file)
@@ -1,6 +1,7 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+//
 
 #include <vector>
 #include <memory>
index 6c6dce6..0ed3a35 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 964ede0..9e6b628 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2020 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 3d00a0f..afdcd75 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/tests/ngraph_functions/src/strided_slice.cpp b/inference-engine/tests/ngraph_functions/src/strided_slice.cpp
new file mode 100644 (file)
index 0000000..5fdd48b
--- /dev/null
@@ -0,0 +1,28 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ngraph_functions/builders.hpp"
+
+namespace ngraph {
+namespace builder {
+std::shared_ptr<ngraph::Node> makeStridedSlice(const ngraph::Output<Node> &in,
+                                               const std::vector<int64_t> &begin,
+                                               const std::vector<int64_t> &end,
+                                               const std::vector<int64_t> &stride,
+                                               const element::Type &type,
+                                               const std::vector<int64_t> &begin_mask,
+                                               const std::vector<int64_t> &end_mask,
+                                               const std::vector<int64_t> &new_axis_mask,
+                                               const std::vector<int64_t> &shrink_mask,
+                                               const std::vector<int64_t> &ellipsis_mask) {
+    ngraph::Shape constShape = {in.get_shape().size()};
+    auto beginNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, begin.data());
+    auto endNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, end.data());
+    auto strideNode = std::make_shared<ngraph::opset1::Constant>(ngraph::element::i64, constShape, stride.data());
+    auto ssNode = std::make_shared<ngraph::opset2::StridedSlice>(in, beginNode, endNode, strideNode, begin_mask, end_mask);
+    return ssNode;
+}
+
+}  // namespace builder
+}  // namespace ngraph
index cb9a594..fe91c4a 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -7,6 +7,8 @@
 
 #include <ngraph/opsets/opset1.hpp>
 
+#include <ngraph_functions/utils/ngraph_helpers.hpp>
+
 namespace ngraph {
 namespace helpers {
 
@@ -20,13 +22,51 @@ ngraph::OutputVector convert2OutputVector(const std::vector<std::shared_ptr<ngra
     return outs;
 }
 
-template<class opType>
-ngraph::NodeVector castOps2Nodes(const std::vector<std::shared_ptr<opType>> &ops) {
-    ngraph::NodeVector nodes;
-    for (const auto &op : ops) {
-        nodes.push_back(std::dynamic_pointer_cast<ngraph::Node>(op));
+std::vector<std::vector<std::uint8_t>> interpreterFunction(const std::shared_ptr<Function>& function, const std::vector<std::vector<std::uint8_t>>& inputs) {
+    ngraph::runtime::Backend::set_backend_shared_library_search_directory("");
+    ngraph_register_interpreter_backend();
+    auto backend = ngraph::runtime::Backend::create("INTERPRETER");
+
+    const auto& parameters = function->get_parameters();
+    const auto& parametersNumber = parameters.size();
+    const auto& inputsNumber = inputs.size();
+    NGRAPH_CHECK(parametersNumber == inputsNumber,
+        "Got function (", function->get_friendly_name(), ") with ", parametersNumber, " parameters, but ", inputsNumber, " input blobs");
+
+    auto inputTensors = std::vector<std::shared_ptr<runtime::Tensor>>{};
+    for (const auto& parameter : parameters) {
+        const auto& parameterIndex = function->get_parameter_index(parameter);
+        const auto& parameterShape = parameter->get_shape();
+        const auto& parameterType  = parameter->get_element_type();
+        const auto& parameterSize  = ngraph::shape_size(parameterShape) * parameterType.size();
+
+        const auto& input = inputs[parameterIndex];
+        const auto& inputSize = input.size();
+        NGRAPH_CHECK(parameterSize == inputSize,
+            "Got parameter (", parameter->get_friendly_name(), ") of size ", parameterSize, " bytes, but corresponding input with index ", parameterIndex,
+            " has ", inputSize, " bytes");
+
+        auto tensor = backend->create_tensor(parameterType, parameterShape);
+        tensor->write(input.data(), parameterSize);
+        inputTensors.push_back(tensor);
     }
-    return nodes;
+
+    auto outputTensors = std::vector<std::shared_ptr<runtime::Tensor>>{};
+    const auto& results = function->get_results();
+    std::transform(results.cbegin(), results.cend(), std::back_inserter(outputTensors), [&backend](const std::shared_ptr<op::Result>& result) {
+        return backend->create_tensor(result->get_element_type(), result->get_shape()); });
+
+    auto handle = backend->compile(function);
+    handle->call_with_validate(outputTensors, inputTensors);
+    auto outputs = std::vector<std::vector<std::uint8_t>>(results.size());
+    for (const auto& result : results) {
+        const auto& resultIndex = function->get_result_index(result);
+        auto& output = outputs[resultIndex];
+        output.resize(ngraph::shape_size(result->get_shape()) * result->get_element_type().size());
+        outputTensors[resultIndex]->read(output.data(), output.size());
+    }
+
+    return outputs;
 }
 
 }  // namespace helpers
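
A minimal sketch of how a test can drive this helper (illustrative only, not part of this commit; assumes <cstring> and the opset1 headers already pulled in above):

    // Hypothetical sketch: run a tiny Relu graph on the INTERPRETER backend
    // and read the raw output bytes back.
    auto param = std::make_shared<ngraph::opset1::Parameter>(ngraph::element::f32, ngraph::Shape{1, 4});
    auto relu  = std::make_shared<ngraph::opset1::Relu>(param);
    auto func  = std::make_shared<ngraph::Function>(ngraph::NodeVector{relu}, ngraph::ParameterVector{param});

    std::vector<float> in = {-1.f, 0.f, 2.f, -3.f};
    std::vector<std::uint8_t> inBytes(in.size() * sizeof(float));
    std::memcpy(inBytes.data(), in.data(), inBytes.size());

    // One byte buffer per function parameter, in parameter order.
    auto outBytes = ngraph::helpers::interpreterFunction(func, {inBytes});
    // outBytes[0] now holds four floats: {0, 0, 2, 0}.
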
index 79ceb78..7769b67 100644 (file)
@@ -1,4 +1,4 @@
-# Copyright (C) 2019 Intel Corporation
+# Copyright (C) 2018-2020 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 
index 00abeb5..f11f274 100644 (file)
@@ -1,7 +1,6 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #define INTEL_GNA_DLLEXPORT 1
 
 #if GNA_LIB_VER == 1
index 9b335c8..d94c8c1 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp b/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp
new file mode 100644 (file)
index 0000000..fb0ac0e
--- /dev/null
@@ -0,0 +1,176 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gna/gna_config.hpp>
+#include "gna_plugin_config.hpp"
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <map>
+
+using namespace InferenceEngine;
+using namespace GNAPluginNS;
+
+const std::map<std::string, std::string>  supportedConfigKeysWithDefaults = {
+    {GNA_CONFIG_KEY(SCALE_FACTOR), "1.000000"},
+    {GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0"), "1.000000"},
+    {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), ""},
+    {GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), ""},
+    {GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT},
+    {GNA_CONFIG_KEY(COMPACT_MODE), CONFIG_VALUE(YES)},
+    {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
+    {GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name()},
+    {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)},
+    {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)},
+    {GNA_CONFIG_KEY(LIB_N_THREADS), "1"},
+    {CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)}
+};
+
+class GNAPluginConfigTest : public ::testing::Test {
+protected:
+    Config config;
+    void SetAndCompare(const std::string& key, const std::string& val) {
+        config.UpdateFromMap({{key, val}});
+        EXPECT_EQ(config.GetParameter(key), val);
+    }
+    void ExpectThrow(const std::string& key, const std::string& val) {
+        EXPECT_THROW(config.UpdateFromMap({{key, val}}),
+                     details::InferenceEngineException);
+    }
+    void SetAndCheckFlag(const std::string& key, bool& val, bool reverse = false) {
+        const bool yes = reverse ? false : true;
+        const bool no = !yes;
+        SetAndCompare(key, CONFIG_VALUE(YES));
+        EXPECT_EQ(val, yes);
+        SetAndCompare(key, CONFIG_VALUE(NO));
+        EXPECT_EQ(val, no);
+        SetAndCompare(key, CONFIG_VALUE(YES));
+        EXPECT_EQ(val, yes);
+        ExpectThrow(key, "abc");
+        ExpectThrow(key, "");
+    }
+};
+
+TEST_F(GNAPluginConfigTest, GnaConfigDefaultConfigIsExpected) {
+    ASSERT_EQ(config.key_config_map, supportedConfigKeysWithDefaults);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigScaleFactorTest) {
+    config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR), std::string("34")}});
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR)), std::string("34.000000"));
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0")), std::string("34.000000"));
+    EXPECT_EQ(config.inputScaleFactors.size(), 1);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[0], 34.0);
+
+    config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_3"), std::string("15.2")}});
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR)), std::string("34.000000"));
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0")), std::string("34.000000"));
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_1")), std::string("1.000000"));
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_2")), std::string("1.000000"));
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_3")), std::string("15.200000"));
+    EXPECT_EQ(config.inputScaleFactors.size(), 4);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[0], 34.0);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[1], 1.0);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[2], 1.0);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[3], 15.2);
+
+    config.UpdateFromMap({{GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_9"), std::string("8.43")}});
+    EXPECT_EQ(config.GetParameter(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_9")), std::string("8.430000"));
+    EXPECT_EQ(config.inputScaleFactors.size(), 10);
+    EXPECT_FLOAT_EQ(config.inputScaleFactors[9], 8.43);
+
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_10"), std::string("8.43"));
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("&1"), std::string("8.43"));
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_"), std::string("8.43"));
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("abs"), std::string("8.43"));
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR), std::string("abc"));
+    ExpectThrow(GNA_CONFIG_KEY(SCALE_FACTOR), std::string("0"));
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigFirmwareModelImageTest) {
+    SetAndCompare(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), "abc");
+    EXPECT_EQ(config.dumpXNNPath, "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigFirmwareModelImageGeneratorTest) {
+    SetAndCompare(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE_GENERATION), "def");
+    EXPECT_EQ(config.dumpXNNGeneration, "def");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigDeviceModeTest) {
+    SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_HW);
+#if GNA_LIB_VER == 1
+    EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_HARDWARE));
+#else
+    EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeHardware);
+    EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+    SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW);
+#if GNA_LIB_VER == 1
+    EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_SOFTWARE));
+#else
+    EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeSoftware);
+    EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+    SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_SW_EXACT);
+#if GNA_LIB_VER == 1
+    EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE));
+#else
+    EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeSoftware);
+    EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersion1_0);
+#endif
+    SetAndCompare(GNA_CONFIG_KEY(DEVICE_MODE), GNAConfigParams::GNA_AUTO);
+#if GNA_LIB_VER == 1
+    EXPECT_EQ(config.gna_proc_type, static_cast<intel_gna_proc_t>(GNA_AUTO));
+#else
+    EXPECT_EQ(config.pluginGna2AccMode, Gna2AccelerationModeAuto);
+    EXPECT_EQ(config.pluginGna2DeviceConsistent, Gna2DeviceVersionSoftwareEmulation);
+#endif
+    ExpectThrow(GNA_CONFIG_KEY(DEVICE_MODE), "");
+    ExpectThrow(GNA_CONFIG_KEY(DEVICE_MODE), "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigCompactMode) {
+    SetAndCheckFlag(GNA_CONFIG_KEY(COMPACT_MODE),
+                    config.gnaFlags.compact_mode);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigExclusiveAsyncRequestTest) {
+    SetAndCheckFlag(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
+                    config.gnaFlags.exclusive_async_requests);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPrecisionTest) {
+    SetAndCompare(GNA_CONFIG_KEY(PRECISION), Precision(Precision::I8).name());
+    EXPECT_EQ(config.gnaPrecision, Precision::I8);
+    SetAndCompare(GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name());
+    EXPECT_EQ(config.gnaPrecision, Precision::I16);
+    ExpectThrow(GNA_CONFIG_KEY(PRECISION), Precision(Precision::FP32).name());
+    ExpectThrow(GNA_CONFIG_KEY(PRECISION), "");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPwlUniformDesignTest) {
+    SetAndCheckFlag(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN),
+                    config.gnaFlags.uniformPwlDesign);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigPerfCountTest) {
+    SetAndCheckFlag(CONFIG_KEY(PERF_COUNT),
+                    config.gnaFlags.performance_counting);
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigLibNThreadsTest) {
+    SetAndCompare(GNA_CONFIG_KEY(LIB_N_THREADS), "2");
+    EXPECT_EQ(config.gnaFlags.gna_lib_async_threads_num, 2);
+    SetAndCompare(GNA_CONFIG_KEY(LIB_N_THREADS), "25");
+    EXPECT_EQ(config.gnaFlags.gna_lib_async_threads_num, 25);
+    ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "");
+    ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "0");
+    ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "128");
+    ExpectThrow(GNA_CONFIG_KEY(LIB_N_THREADS), "abc");
+}
+
+TEST_F(GNAPluginConfigTest, GnaConfigSingleThreadTest) {
+    SetAndCheckFlag(CONFIG_KEY(SINGLE_THREAD),
+                    config.gnaFlags.gna_openmp_multithreading,
+                    true);
+}
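
For reference, a hedged sketch of how the same keys reach the plugin through the public API in an application (the model path and scale factor value are assumptions, not taken from this commit; assumes <ie_core.hpp> and <gna/gna_config.hpp> are included):

    InferenceEngine::Core core;
    std::map<std::string, std::string> gnaConfig = {
        {GNA_CONFIG_KEY(DEVICE_MODE), InferenceEngine::GNAConfigParams::GNA_SW_EXACT},
        {GNA_CONFIG_KEY(SCALE_FACTOR) + std::string("_0"), "1024"},
        {GNA_CONFIG_KEY(PRECISION), "I16"},
        {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES)}
    };
    auto network    = core.ReadNetwork("model.xml");          // hypothetical model path
    auto executable = core.LoadNetwork(network, "GNA", gnaConfig);
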
index 6c76eaa..ec3416e 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 7f3be7f..c0b0380 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index d8c6097..d93df51 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2020 Intel Corporation
+// Copyright (C) 2018-2019 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
index 14265fa..281f283 100644 (file)
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
-set(TARGET_NAME helpers)
+set(TARGET_NAME ieTestHelpers)
 
 file(GLOB HELPERS_SRC
        ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
@@ -15,13 +15,13 @@ file (GLOB HELPERS_INCLUDES
 
 ## Enable Models multiple search pathes
 message("configuring file: ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.h")
-configure_file(test_model_repo.hpp.in ${CMAKE_CURRENT_BINARY_DIR}/test_model_repo.hpp @ONLY)
 
 function(add_helpers target_name)
     add_library(${target_name} STATIC ${HELPERS_SRC})
 
     target_include_directories(${target_name} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}"
                                                      "${IE_MAIN_SOURCE_DIR}/src/inference_engine"
+                                                     $<TARGET_PROPERTY:inference_engine_ir_readers,INTERFACE_INCLUDE_DIRECTORIES>
                                                      $<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
                                                      $<TARGET_PROPERTY:pugixml,INTERFACE_INCLUDE_DIRECTORIES>
                                                      "${IE_MAIN_SOURCE_DIR}/src/vpu/"
@@ -31,9 +31,6 @@ function(add_helpers target_name)
     target_include_directories(${target_name} PUBLIC
         "${IE_MAIN_SOURCE_DIR}/samples/common/os/windows")
 
-    target_compile_definitions(${target_name} PUBLIC ${ARGV}
-        MODELS_PATH=\"${MODELS_PATH}\" DATA_PATH=\"${VALIDATION_SET}\")
-
     set_property(TARGET ${target_name} PROPERTY COMPILE_PDB_NAME ${target_name})
 
     # add_cpplint_target(${target_name}_cpplint FOR_TARGETS ${target_name})
index 9297160..a37f921 100644 (file)
@@ -66,20 +66,29 @@ BufferWrapper::BufferWrapper(const Blob::Ptr& blob, Precision _precision) : prec
         fp16_ptr = blob->buffer().as<ie_fp16*>();
     } else if (precision == Precision::FP32) {
         fp32_ptr = blob->buffer().as<float*>();
+    } else if (precision == Precision::I32) {
+        i32_ptr = blob->buffer().as<int32_t*>();
     } else {
         THROW_IE_EXCEPTION << "Unsupported precision for compare: " << precision;
     }
 }
 
 float BufferWrapper::operator[](size_t index) {
-    if (precision == Precision::FP16) return PrecisionUtils::f16tof32(fp16_ptr[index]);
+    if (precision == Precision::FP16) {
+        return PrecisionUtils::f16tof32(fp16_ptr[index]);
+    } else if (precision == Precision::I32) {
+        return i32_ptr[index];
+    }
     return fp32_ptr[index];
 }
 
 void BufferWrapper::insert(size_t index, float value) {
     if (precision == Precision::FP16) {
         fp16_ptr[index] = PrecisionUtils::f32tof16(value);
-    } else {
+    } else if (precision == Precision::I32) {
+        i32_ptr[index] = value;
+    }
+    else {
         fp32_ptr[index] = value;
     }
 }
index 540bb4a..27ef66b 100644 (file)
@@ -127,6 +127,7 @@ class BufferWrapper {
     InferenceEngine::Precision precision;
     InferenceEngine::ie_fp16 *fp16_ptr;
     float *fp32_ptr;
+    int32_t *i32_ptr;
 public:
     explicit BufferWrapper(const InferenceEngine::Blob::Ptr &blob);
 
@@ -38,27 +38,8 @@ static std::string getDirname(std::string filePath) {
 }
 #endif
 
-const char* getModelPathNonFatal() noexcept {
-#ifdef MODELS_PATH
-    const char* models_path = std::getenv("MODELS_PATH");
-
-    if (models_path == nullptr && MODELS_PATH == nullptr) {
-        return nullptr;
-    }
-
-    if (models_path == nullptr) {
-        return MODELS_PATH;
-    }
-
-    return models_path;
-#else
-    return nullptr;
-#endif
-}
-
-
 static std::string get_models_path() {
-    const char* models_path = getModelPathNonFatal();
+    const char* models_path = TestDataHelpers::getModelPathNonFatal();
 
     if (nullptr == models_path) {
         ::testing::AssertionFailure() << "MODELS_PATH not defined";
@@ -97,10 +78,10 @@ static std::vector<std::string> getModelsDirs() {
 }
 
 ModelsPath::operator std::string() const {
-
     std::vector<std::string> absModelsPath;
     for (auto & path  : getModelsDirs()) {
-        const auto absPath = get_models_path() + kPathSeparator + "src" + kPathSeparator + path + _rel_path.str();
+        std::string b = get_models_path();
+        const auto absPath = get_models_path() + kPathSeparator + path + _rel_path.str();
         absModelsPath.push_back(absPath);
         if (exist(absPath)) {
             return absPath;
diff --git a/inference-engine/tests_deprecated/helpers/test_model_repo.hpp b/inference-engine/tests_deprecated/helpers/test_model_repo.hpp
new file mode 100644 (file)
index 0000000..2725631
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <string>
+
+std::string get_model_repo();
+
+namespace TestDataHelpers {
+
+const char *getModelPathNonFatal() noexcept;
+
+std::string get_data_path();
+
+inline const char *getModelPathNonFatalDefault() noexcept {
+#ifdef MODELS_PATH
+    const char *models_path = std::getenv("MODELS_PATH");
+
+    if (models_path == nullptr && MODELS_PATH == nullptr) {
+        return nullptr;
+    }
+
+    if (models_path == nullptr) {
+        return MODELS_PATH;
+    }
+
+    return models_path;
+#else
+    return nullptr;
+#endif
+};
+
+inline std::string get_data_path_default() {
+#ifdef DATA_PATH
+    const char *data_path = std::getenv("DATA_PATH");
+
+    if (data_path == NULL) {
+        if (DATA_PATH != NULL) {
+            data_path = DATA_PATH;
+        } else {
+            return nullptr;
+        }
+    }
+    return std::string(data_path);
+#else
+    return nullptr;
+#endif
+}
+}  // namespace TestDataHelpers
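
Since the configure_file()-generated header is removed below, here is a hedged sketch of the per-suite source that the new header expects each test target to provide (the model-list string is an assumption standing in for the old @MODELS_LST@ substitution):

    // Hypothetical test_model_repo.cpp for one test target.
    #include "test_model_repo.hpp"

    std::string get_model_repo() {
        return "models:2020";  // assumed model-list value
    }

    const char* TestDataHelpers::getModelPathNonFatal() noexcept {
        return TestDataHelpers::getModelPathNonFatalDefault();
    }

    std::string TestDataHelpers::get_data_path() {
        return TestDataHelpers::get_data_path_default();
    }
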
diff --git a/inference-engine/tests_deprecated/helpers/test_model_repo.hpp.in b/inference-engine/tests_deprecated/helpers/test_model_repo.hpp.in
deleted file mode 100644 (file)
index 3f7ad56..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-#include <string>
-std::string get_model_repo() {
-    return "@MODELS_LST@";
-}
\ No newline at end of file
index 57b6f6b..c7a8150 100644 (file)
@@ -173,7 +173,7 @@ static std::vector<std::shared_ptr<BaseTestCreator>>& getCreators() {
             std::make_shared<LayerTestCreator<InferenceEngine::ReduceLayer>>("ReduceSumSquare"),
             std::make_shared<LayerTestCreator<InferenceEngine::TopKLayer>>("TopK"),
             std::make_shared<LayerTestCreator<InferenceEngine::NonMaxSuppressionLayer>>("NonMaxSuppression"),
-            std::make_shared<LayerTestCreator<InferenceEngine::ScatterLayer>>("ScatterUpdate")
+            std::make_shared<LayerTestCreator<InferenceEngine::ScatterUpdateLayer>>("ScatterUpdate")
     };
     return creators;
 }
index 03b027e..cbbd110 100644 (file)
@@ -20,6 +20,7 @@
 #include <ie_input_info.hpp>
 #include <ie_icnn_network.hpp>
 
+#include "test_model_repo.hpp"
 #include "test_model_path.hpp"
 #include <tests_file_utils.hpp>
 #include <chrono>
@@ -41,11 +42,14 @@ inline std::string to_string_c_locale(T value) {
 class TestsCommon : public ::testing::Test {
 public:
     IE_SUPPRESS_DEPRECATED_START
-    static InferenceEngine::CNNLayer::Ptr createLayer(const std::string& type);
+
+    static InferenceEngine::CNNLayer::Ptr createLayer(const std::string &type);
+
     IE_SUPPRESS_DEPRECATED_END
 
 protected:
     void SetUp() override;
+
     void TearDown() override;
 
 public:
@@ -53,19 +57,6 @@ public:
         return make_plugin_name("mock_engine");
     }
 
-    static std::string get_data_path(){
-        const char* data_path = std::getenv("DATA_PATH");
-
-        if (data_path == NULL){
-            if(DATA_PATH != NULL){
-                data_path = DATA_PATH;
-            } else{
-                ::testing::AssertionFailure()<<"DATA_PATH not defined";
-            }
-        }
-        return std::string(data_path);
-    }
-
     static std::string make_so_name(const std::string & input) {
         return CommonTestUtils::pre + input + IE_BUILD_POSTFIX + CommonTestUtils::ext;
     }
index 21fef55..1a968f7 100644 (file)
@@ -157,15 +157,16 @@ endif ()
 
 target_link_libraries(${TARGET_NAME} PRIVATE
     # static libraries
+    inference_engine_s # need to have this explicitly for USE_STATIC_IE
     unitTestUtils
-    helpers_s
+    ieTestHelpers_s
     ${GNA_TEST_ENGINE}
 
     # dynamic libraries
     inference_engine_lp_transformations
+    inference_engine_ir_readers
     inference_engine_transformations
-    ${CMAKE_DL_LIBS}
-    )
+    ${CMAKE_DL_LIBS})
 
 if(TARGET libGNAStubs)
     target_link_libraries(${TARGET_NAME} PRIVATE libGNAStubs)
@@ -175,6 +176,11 @@ if (ENABLE_MKL_DNN)
     target_link_libraries(${TARGET_NAME} PRIVATE mkldnn)
 endif ()
 
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fuse-ld=gold")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fuse-ld=gold")
+endif()
+
 add_test(NAME ${TARGET_NAME}
         COMMAND ${TARGET_NAME})
 
index 85f5975..6edab00 100644 (file)
@@ -29,7 +29,8 @@ TEST_F (V2FormatParserTest, invalidXml_ShouldThrow) {
             .node("net")
             .attr("name", "AlexNet").attr("version", 2);
 
-    ASSERT_THROW(parse(content), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    ASSERT_THROW(parse(content), std::exception);
 }
 
 TEST_F (V2FormatParserTest, canParseDims) {
index 7958f60..a4e2f86 100644 (file)
@@ -324,6 +324,9 @@ void GNAPropagateMatcher :: match() {
                     case GnaPluginTestEnvironment::matchAffineWeights:
                         HasWeightsEq(combined, _env.transposedData);
                         break;
+                    case GnaPluginTestEnvironment::matchAffineWeightsSize:
+                        HasWeightsSizeEq(combined, _env.matched_weight_size);
+                        break;
                     case GnaPluginTestEnvironment::saveAffineWeights:
                         SaveWeights(combined, _env.transposedData, _env.transposedArgsForSaving);
                         break;
index ee07ca2..a96224e 100644 (file)
@@ -71,7 +71,8 @@ class GnaPluginTestEnvironment {
         fillOutputValues,
         matchAffineWeightsTranspose,
         matchAffineWeights,
-        saveAffineWeights
+        matchAffineWeightsSize,
+        saveAffineWeights,
     };
     enum {
         kUnset = -1,
@@ -113,6 +114,8 @@ class GnaPluginTestEnvironment {
     std::pair<int, int> transposedArgsForSaving;
     std::vector<uint16_t>* transposedData;
     std::vector<DnnActivationType> pwlsToMatchWith;
+    size_t matched_weight_size = 0;
+    size_t nCopyLayersToMatch = -1;
 };
 
 class GNATestBase {
@@ -162,6 +165,14 @@ class GNATestConfigurability : public GNATestBase{
         _env.config[keyName] = ss.str();
         return *dynamic_cast<T*>(this);
     }
+    T & onCPU() {
+        _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = GNA_CONFIG_VALUE(SW_FP32);
+        return *dynamic_cast<T*>(this);
+    }
+    T & withPolicy(GNAPluginNS::Policy::ConcatAlignment concatAlignmentPolicy) {
+        _env.policy.ConcatAlignmentPolicy = concatAlignmentPolicy;
+        return *dynamic_cast<T*>(this);
+    }
     T & withGNADeviceMode(std::string value) {
         _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = value;
         return *dynamic_cast<T*>(this);
@@ -210,10 +221,6 @@ class GNAPropagateMatcher : public GNATestConfigurability<GNAPropagateMatcher> {
         return *this;
     }
 
-    GNAPropagateMatcher & And() {
-        return *this;
-    }
-
     GNAPropagateMatcher & that() {
         return *this;
     }
@@ -268,7 +275,6 @@ class GNAPropagateMatcher : public GNATestConfigurability<GNAPropagateMatcher> {
         return *this;
     }
 
-
     GNAPropagateMatcher & once() {
         return times(1);
     }
@@ -356,7 +362,6 @@ class GNAPropagateMatcher : public GNATestConfigurability<GNAPropagateMatcher> {
         return *this;
     }
 
-
     GNAPropagateMatcher & affine_weights_transpozed(std::pair<int, int> &&transpozedArgs) {
         getMatcher().type = GnaPluginTestEnvironment::saveAffineWeights;
         _env.transposedArgsForSaving = std::move(transpozedArgs);
@@ -420,11 +425,6 @@ class GNAPropagateMatcher : public GNATestConfigurability<GNAPropagateMatcher> {
         return *this;
     }
 
-    GNAPropagateMatcher & onCPU() {
-        _env.config[GNA_CONFIG_KEY(DEVICE_MODE)] = GNA_CONFIG_VALUE(SW_FP32);
-        return *this;
-    }
-
  protected:
     void match();
     intel_nnet_type_t * original_nnet = nullptr;
@@ -513,6 +513,26 @@ class GNAQueryStateMatcher : public GNADumpXNNMatcher {
     void match();
 };
 
+/**
+ * @brief weights matcher has specific weights matching methods
+ */
+class GNAWeightsMatcher : public GNAPropagateMatcher {
+ public:
+    using base = GNAPropagateMatcher;
+    using base::base;
+
+    GNAWeightsMatcher & size() {
+        getMatcher().type = GnaPluginTestEnvironment::matchAffineWeightsSize;
+        return *this;
+    }
+    GNAWeightsMatcher & equals_to(size_t weights_size) {
+        if (getMatcher().type == GnaPluginTestEnvironment::matchAffineWeightsSize) {
+            _env.matched_weight_size = weights_size;
+        }
+        return *this;
+    }
+};
+
 
 
 /**
@@ -581,6 +601,16 @@ class GNATest : public U, public GNATestConfigurability<GNATest<U>>  {
         _env.model = _model;
         return *this;
     }
+    GNATest & afterLoadingModel(std::shared_ptr<ngraph::Function> ngraph_model) {
+        _env.ngraph_model = ngraph_model;
+        return *this;
+    }
+
+    GNAWeightsMatcher & affine_weights() {
+        returnedMatchers.push_back(std::make_shared<GNAWeightsMatcher>(_env));
+        _env = GnaPluginTestEnvironment();
+        return dynamic_cast<GNAWeightsMatcher&>(*returnedMatchers.back());
+    }
 
     GNAQueryStateMatcher & queryState() {
         returnedMatchers.push_back(std::make_shared<GNAQueryStateMatcher>(_env));
@@ -597,10 +627,12 @@ class GNATest : public U, public GNATestConfigurability<GNATest<U>>  {
         _env = GnaPluginTestEnvironment();
         return dynamic_cast<GNAPropagateMatcher&>(*returnedMatchers.back());
     }
+
     GNATest & importedFrom(std::string fileName) {
         _env.importedModelFileName = fileName;
         return *this;
     }
+
     GNATest & onInferModel(std::string _model = "",
                            std::function<void (InferenceEngine::CNNNetwork &)> _cb = [](InferenceEngine::CNNNetwork & net){}) {
         _env.model = _model;
diff --git a/inference-engine/tests_deprecated/unit/engines/gna/layers/gna_align_filter2_tests.cpp b/inference-engine/tests_deprecated/unit/engines/gna/layers/gna_align_filter2_tests.cpp
new file mode 100644 (file)
index 0000000..e12fecd
--- /dev/null
@@ -0,0 +1,184 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <tuple>
+#include <vector>
+#include <gtest/gtest.h>
+#include <single_layer_common.hpp>
+#include <ngraph/op/parameter.hpp>
+#include <ngraph/ops.hpp>
+#include <ie_precision.hpp>
+#include "../gna_matcher.hpp"
+
+using GNAAlignFilterTestParams  = std::tuple<InferenceEngine::Precision, GNAPluginNS::Policy::ConcatAlignment, std::size_t, std::size_t>;
+using namespace GNAPluginNS;
+
+class GNAAlignFilterTest : public GNATest<>,
+                             public testing::WithParamInterface<GNAAlignFilterTestParams> {
+ public:
+
+    static std::string getTestName(const testing::TestParamInfo<GNAAlignFilterTestParams>& params) {
+        std::string test_name;
+        if (std::get<1>(params.param) == GNAPluginNS::Policy::ConcatAlignment::FAST) {
+            test_name += "fast_";
+        }
+        test_name += "concat_of(" + std::to_string(std::get<2>(params.param));
+        test_name += "_" + std::to_string(std::get<3>(params.param));
+        test_name += ")_on_";
+        test_name += std::get<0>(params.param).name();
+        return test_name;
+    }
+
+ protected:
+
+    InferenceEngine::Precision precision = InferenceEngine::Precision::FP32;
+    std::size_t concat_inputs[2];
+    GNAPluginNS::Policy::ConcatAlignment alignmentPolicy;
+
+    void SetUp() override {
+        std::tie(precision, alignmentPolicy, concat_inputs[0], concat_inputs[1]) = GetParam();
+    }
+
+    std::shared_ptr<ngraph::Function> getNgraphModel() {
+        auto input0 = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{1, concat_inputs[0]});
+        auto input1 = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{1, concat_inputs[1]});
+
+        auto relu0 = std::make_shared<ngraph::op::v0::Relu>(input0);
+        auto relu1 = std::make_shared<ngraph::op::v0::Relu>(input1);
+
+        auto concat = std::make_shared<ngraph::op::Concat>(ngraph::NodeVector{relu0, relu1}, 1);
+
+        auto relu3 = std::make_shared<ngraph::op::v0::Relu>(concat);
+
+        auto function = std::make_shared<ngraph::Function>(ngraph::NodeVector{relu3}, ngraph::ParameterVector{input0, input1});
+        return function;
+    }
+};
+
+TEST_P(GNAAlignFilterTest, concatWith_2_Inputs_Small_mem_footprint) {
+
+    auto ngraf = getNgraphModel();
+    if (precision == InferenceEngine::Precision::FP32) {
+        GTEST_SKIP() << "FP32 case - won't produce gna primitives";
+    }
+
+    // calc expected weight size
+    size_t expected_affine_size = 0;
+    size_t expected_copy_layers = 0;
+
+    auto getFastAffineFilterParams = [](size_t sz) -> std::pair<size_t, size_t> {
+        //align first input by 8
+        auto copy_N = sz > 32 ? 1 : 0; // number of copy layers
+        auto firstFilter_frac = sz % 32;
+        auto firstFilter_N = ALIGN(firstFilter_frac, 8);
+
+        return {copy_N, firstFilter_N   * firstFilter_frac};
+    };
+
+    auto getNumCopyElements = [&getFastAffineFilterParams](size_t sz) {
+        return getFastAffineFilterParams(sz).first;
+    };
+    auto getsNumFilterWeights = [&getFastAffineFilterParams](size_t sz) {
+        return getFastAffineFilterParams(sz).second;
+    };
+
+    switch(alignmentPolicy) {
+        case  Policy::ConcatAlignment::ENABLED : {
+            //align first input by 8
+            auto firstFilter = ALIGN(concat_inputs[0], 8) * concat_inputs[0];
+            //align first input by 8
+            auto extraLeftElementsForSecond = concat_inputs[0] + 32 - ALIGN(concat_inputs[0], 32);
+
+            auto secondFilter = ALIGN(concat_inputs[1], 8) * (extraLeftElementsForSecond + concat_inputs[1]);
+
+            expected_affine_size = firstFilter + secondFilter;
+            break;
+        }
+        case   Policy::ConcatAlignment::FAST  : {
+
+            expected_copy_layers = getNumCopyElements(concat_inputs[0]);
+            expected_affine_size = getsNumFilterWeights(concat_inputs[0]);
+
+            // calculation size for second filter
+            auto offset = ALIGN(concat_inputs[0], 32) - 32;
+            auto zerolen = concat_inputs[0] - offset;
+            auto second_output_len = zerolen + concat_inputs[1];
+
+            expected_affine_size += second_output_len  * ALIGN(concat_inputs[1], 8);
+            break;
+        }
+
+        default : {
+            FAIL() << "unsupported align policy: " << alignmentPolicy;
+        }
+    }
+
+    assert_that().onInferNgraphModel(ngraf)
+        .inNotCompactMode()
+        .withPolicy(alignmentPolicy)
+        .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", 1.0f)
+        .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_1", 1.0f)
+        .withGNAConfig(GNA_CONFIG_KEY(PRECISION), precision.name())
+        .gna()
+        .affine_weights()
+        .size()
+        .equals_to(expected_affine_size)
+        .And()
+        .copy_inserted_into_nnet()
+        .times(expected_copy_layers);
+}
+
+TEST_P(GNAAlignFilterTest, concatWith_2_Inputs_accurate) {
+    auto ngraf = getNgraphModel();
+    if (precision == InferenceEngine::Precision::FP32) {
+        std::vector<std::vector<float>> input_data;
+        float start_value = 1.0;
+
+        for (auto dim : concat_inputs) {
+            if (dim > 0) {
+                input_data.push_back(std::vector<float>(dim));
+
+                std::iota(input_data.back().begin(), input_data.back().end(), start_value);
+                start_value += dim;
+            }
+        }
+
+        std::vector<float> expected_result(static_cast<size_t>(start_value - 1));
+        start_value = 1.0;
+        std::iota(expected_result.begin(), expected_result.end(), start_value);
+        assert_that().onInferNgraphModel(ngraf)
+            .inNotCompactMode()
+            .gna()
+            .propagate_forward()
+            .onCPU()
+            .withPolicy(alignmentPolicy)
+            .called_with()
+            .input(ngraf->get_parameters().at(0)->get_name(), input_data[0])
+            .input(ngraf->get_parameters().at(1)->get_name(), input_data[1])
+            .equals_to(expected_result);
+    } else {
+        assert_that().onInferNgraphModel(ngraf)
+            .inNotCompactMode()
+            .gna()
+            .withPolicy(alignmentPolicy)
+            .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_0", 1.0f)
+            .withGNAConfig(std::string(GNA_CONFIG_KEY(SCALE_FACTOR)) + "_1", 1.0f)
+            .withGNAConfig(GNA_CONFIG_KEY(PRECISION), "I16")
+            .propagate_forward()
+            .called();
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    GNALayerTests,
+    GNAAlignFilterTest,
+    testing::Combine(
+    testing::Values(InferenceEngine::Precision::FP32, InferenceEngine::Precision::I16),
+    //fast or not fast alignment policy
+    testing::Values(GNAPluginNS::Policy::ConcatAlignment::FAST, GNAPluginNS::Policy::ConcatAlignment::ENABLED),
+    // Size of first Split layer output
+    testing::Values(31, 49),
+    // Size of second Split layer output
+    testing::Values(31, 73)),
+    GNAAlignFilterTest::getTestName);
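
To make the expected-size bookkeeping in the test above concrete, a worked example under the assumption that ALIGN(x, n) rounds x up to the next multiple of n (the macro itself is defined elsewhere in the plugin sources):

    // Inputs {31, 73}, Policy::ConcatAlignment::ENABLED:
    //   firstFilter          = ALIGN(31, 8) * 31        = 32 * 31  = 992
    //   extraLeftForSecond   = 31 + 32 - ALIGN(31, 32)  = 31
    //   secondFilter         = ALIGN(73, 8) * (31 + 73) = 80 * 104 = 8320
    //   expected_affine_size = 992 + 8320               = 9312 weights
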
index a3f9d43..d75fe1c 100644 (file)
@@ -7,19 +7,36 @@
 class CopyLayerMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
     bool matchInserted;
     const int matchQuantity;
+    mutable int actualNumberOfCopyLayers;
  public:
     CopyLayerMatcher(bool matchInserted, int matchQuantity) : matchInserted(matchInserted), matchQuantity(matchQuantity) {}
     bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
         if (foo == nullptr)
             return false;
+        actualNumberOfCopyLayers = 0;
+
         for(int i = 0; i < foo->nLayers; i++) {
             if (foo->pLayers[i].nLayerKind != INTEL_COPY) continue;
-            return matchInserted;
+
+            if (!matchInserted) {
+                return false;
+            }
+            actualNumberOfCopyLayers ++;
+        }
+        if (matchQuantity == -1) {
+            if (actualNumberOfCopyLayers > 0) {
+                return true;
+            }
+            return false;
+        }
+        if (actualNumberOfCopyLayers != matchQuantity) {
+            return false;
         }
-        return !matchInserted;
+        return true;
     };
     void DescribeTo(::std::ostream *os) const override {
-        *os << "should "<< (matchInserted ? "" : "not ") << "have Copy primitive as part of nnet structure";
+        *os << "should "<< (matchInserted ? "" : "not ") << "have " << (matchInserted ? std::to_string(matchQuantity) : "" )
+            << " Copy primitives as part of nnet structure" << (matchInserted ? std::string(" but was only: ") + std::to_string(actualNumberOfCopyLayers) + " copy layers" : "" );
     }
 };
 
index 44e750c..2b18587 100644 (file)
@@ -104,7 +104,7 @@ class WeightsMatcher : public ::testing::MatcherInterface<const intel_nnet_type_
             auto affine = (intel_affine_func_t*)foo->pLayers[i].pLayerStruct;
 
             auto affineWeightsSize = foo->pLayers[i].nOutputRows *
-                foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows;
+                (foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows);
 
             if (affineWeightsSize != std::get<0>(transpozedData)->size()) {
                 error << "gna-xnn layer(" << i << ") weights size mismatch: expected "
@@ -136,6 +136,49 @@ class WeightsMatcher : public ::testing::MatcherInterface<const intel_nnet_type_
     }
 };
 
+class WeightsSizeMatcher : public ::testing::MatcherInterface<const intel_nnet_type_t*> {
+    enum HowMatch{
+        eNone,
+        eEqAffine,
+    } eMatchKind;
+
+    mutable std::stringstream error;
+    mutable int actual;
+    size_t expected_weights_size;
+ public:
+    explicit WeightsSizeMatcher(const size_t data_len) :
+        eMatchKind(eEqAffine),
+        expected_weights_size(data_len){
+    }
+    bool MatchAndExplain(const intel_nnet_type_t *foo, ::testing::MatchResultListener *listener) const override {
+        if (foo == nullptr)
+            return false;
+
+        size_t sizeTotal = 0;
+        std::stringstream ss;
+        for(int i = 0; i < foo->nLayers; i++) {
+            if (foo->pLayers[i].nLayerKind != INTEL_AFFINE && eMatchKind == eEqAffine) continue;
+
+            auto affineWeightsSize = foo->pLayers[i].nOutputRows *
+                (foo->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL ? 1 : foo->pLayers[i].nInputRows);
+
+            sizeTotal += affineWeightsSize;
+            ss << "[" << i << "]: " << affineWeightsSize << ", ";
+
+        }
+
+        if (eMatchKind == eEqAffine &&  sizeTotal != expected_weights_size) {
+            error << "gna-affine layers " << ss.str() << " have diff total weights size : " << sizeTotal
+                  << ", while expected to have: " << expected_weights_size << "\n";
+            return false;
+        }
+        return true;
+    };
+    void DescribeTo(::std::ostream *os) const override {
+        *os << error.str() << std::endl;
+    }
+};
+
 
 class WeightsSaver: public ::testing::MatcherInterface<const intel_nnet_type_t*> {
     mutable TranspozeIterator iterator;
@@ -182,3 +225,7 @@ void SaveWeights(std::unique_ptr<NNetComponentMatcher>& components,  std::vector
     components->add(new WeightsSaver(make_tuple(data, dims.first, dims.second)));
 }
 
+void HasWeightsSizeEq(std::unique_ptr<NNetComponentMatcher>& components,  size_t weights_size) {
+    components->add(new WeightsSizeMatcher(weights_size));
+}
+
index bf66e43..69b4c3b 100644 (file)
@@ -108,6 +108,7 @@ TEST(MKLDNNDumpTests, SerU8AsTxt) {
 
     std::string deser_header, ref_header = "U8 4D shape: 2 3 4 5 (120)";
     std::getline(buff, deser_header);
+    deser_header = deser_header.substr(0, ref_header.length());
     ASSERT_EQ(deser_header, ref_header);
 
     auto num_line = std::count(std::istreambuf_iterator<char>(buff),
@@ -129,6 +130,7 @@ TEST(MKLDNNDumpTests, SerAsTxt) {
 
     std::string deser_header, ref_header = "FP32 2D shape: 2 3 (6)";
     std::getline(buff, deser_header);
+    deser_header = deser_header.substr(0, ref_header.length());
     ASSERT_EQ(deser_header, ref_header);
 
     auto num_line = std::count(std::istreambuf_iterator<char>(buff),
index 5c72381..0670283 100644 (file)
@@ -187,7 +187,13 @@ protected:
     }
 };
 
-TEST_P(MKLDNNCPUExtScatterTFTests, TestsScatter) {}
+// Disabled these tests as they need to adjust with new specs:
+// - new Scatter Update layer: like TF scatter_update
+// - new Scatter Elements Update: like ONNX Scatter Elements
+// See merge requests:
+// DLDT #6005: Specification for the ScatterElementsUpdate layer
+// DLDT #6091: Specification for ScatterUpdate operation
+TEST_P(MKLDNNCPUExtScatterTFTests, DISABLED_TestsScatter) {}
 
 INSTANTIATE_TEST_CASE_P(
         TestsScatter, MKLDNNCPUExtScatterTFTests,
index 3103f98..c0232cb 100644 (file)
@@ -79,13 +79,13 @@ TEST_F(VPU_AdjustDataLocationTest, FlushCMX_TwoSpecialConsumers) {
 
     pipeline.run(model);
 
-    ASSERT_EQ(data1->location(), DataLocation::CMX);
+    ASSERT_EQ(data1->dataLocation().location, Location::CMX);
     ASSERT_EQ(data1->numConsumers(), 1);
 
     auto data1Consumer = data1->singleConsumer();
     auto data1ConsumerOutput = data1Consumer->output(0);
     ASSERT_EQ(data1Consumer->type(), StageType::Copy);
-    ASSERT_EQ(data1ConsumerOutput->location(), DataLocation::BSS);
+    ASSERT_EQ(data1ConsumerOutput->dataLocation().location, Location::BSS);
     ASSERT_EQ(data1ConsumerOutput->numChildDatas(), 4);
     ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data2](const SharedAllocation& e) { return e->child() == data2; }));
     ASSERT_TRUE(contains(data1ConsumerOutput->childDataEdges(), [data3](const SharedAllocation& e) { return e->child() == data3; }));
@@ -152,13 +152,13 @@ TEST_F(VPU_AdjustDataLocationTest, SpillWithBranch) {
     pipeline.run(model);
 
     auto hw1Output = hw1->output(0);
-    ASSERT_EQ(hw1Output->location(), DataLocation::CMX);
+    ASSERT_EQ(hw1Output->dataLocation().location, Location::CMX);
 
     auto copyStage = hw1Output->singleConsumer();
     ASSERT_EQ(copyStage->type(), StageType::Copy);
 
     auto copyStageOutput = copyStage->output(0);
-    ASSERT_EQ(copyStageOutput->location(), DataLocation::BSS);
+    ASSERT_EQ(copyStageOutput->dataLocation().location, Location::BSS);
 
     ASSERT_EQ(copyStageOutput->numConsumers(), 2);
     for (const auto& copyStageOutputConsumer : copyStageOutput->consumers()) {
index 72c0c26..dc666d0 100644 (file)
@@ -3,6 +3,8 @@
 //
 
 #include "graph_transformer_tests.hpp"
+#include <vpu/model/data_contents/replicated_data_content.hpp>
+
 #include <precision_utils.h>
 
 using namespace vpu;
@@ -19,8 +21,8 @@ TEST_F(VPU_EliminateConstConcatTest, EliminateCase_1D) {
 
     const auto model = CreateModel();
 
-    const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize()));
-    const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize()));
+    const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize(), dataDesc1));
+    const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize(), dataDesc2));
 
     const auto concatData = model->addNewData("concat", dataDescConcat);
 
@@ -74,8 +76,8 @@ TEST_F(VPU_EliminateConstConcatTest, EliminateCase_2D) {
 
     const auto model = CreateModel();
 
-    const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize()));
-    const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize()));
+    const auto constData1 = model->addConstData("const1", dataDesc1, replicateContent(1.0f, dataDesc1.totalDimSize(), dataDesc1));
+    const auto constData2 = model->addConstData("const2", dataDesc2, replicateContent(2.0f, dataDesc2.totalDimSize(), dataDesc2));
 
     const auto concatData = model->addNewData("concat", dataDescConcat);
 
index f1e0bf6..6ecf86a 100644 (file)
@@ -2,10 +2,12 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include <initializer_list>
+#include "graph_transformer_tests.hpp"
+
 #include <vpu/stages/stub_stage.hpp>
+#include <vpu/model/data_contents/ie_blob_content.hpp>
 
-#include "graph_transformer_tests.hpp"
+#include <initializer_list>
 
 using namespace vpu;
 
index 9ddf0b1..65379f3 100644 (file)
@@ -5,6 +5,7 @@
 #include <vpu/stages/stub_stage.hpp>
 
 #include "graph_transformer_tests.hpp"
+#include "vpu/model/data_contents/ie_blob_content.hpp"
 
 using namespace vpu;
 
index 7fbc02c..2061103 100644 (file)
@@ -152,12 +152,6 @@ TEST_F(CNNNGraphImplTests, TestSetBatch) {
 
     ASSERT_EQ(2, cnnNet.getBatchSize());
     ASSERT_EQ(2, cnnNet.getCNNNetwork()->getBatchSize());
-
-    auto cnnNet2 = cnnNet.cloneNGraphImpl();
-
-    ASSERT_EQ(2, cnnNet2->getBatchSize());
-    ASSERT_EQ(2, cnnNet2->getCNNNetwork()->getBatchSize());
-    ASSERT_NE(cnnRefNet, cnnNet2->getCNNNetwork());
 }
 
 TEST_F(CNNNGraphImplTests, TestSaveAffinity) {
index d7f27c6..1f5e0ef 100644 (file)
@@ -7,6 +7,7 @@
 #include "tests_common.hpp"
 
 #include <convert_function_to_cnn_network.hpp>
+#include <cpp/ie_cnn_network.h>
 
 #include <ngraph/function.hpp>
 #include <ngraph/opsets/opset1.hpp>
@@ -30,7 +31,7 @@ TEST_F(ConvertFunctionToCNNNetworkTests, ConvertPReLUNetwork) {
                                                ngraph::ParameterVector{param1, param2});
     }
 
-    InferenceEngine::details::CNNNetworkNGraphImpl nGraphImpl(f);
+    InferenceEngine::CNNNetwork nGraphImpl(f);
     try {
         auto net = InferenceEngine::details::convertFunctionToICNNNetwork(f, nGraphImpl);
         FAIL();
@@ -59,10 +60,10 @@ TEST_F(ConvertFunctionToCNNNetworkTests, ConvertConvolutionNetwork) {
                                                ngraph::ParameterVector{param1, param2});
     }
 
-    InferenceEngine::details::CNNNetworkNGraphImpl nGraphImpl(f);
+    InferenceEngine::CNNNetwork nGraphImpl(f);
     try {
         auto net = InferenceEngine::details::convertFunctionToICNNNetwork(f, nGraphImpl);
     } catch (InferenceEngine::details::InferenceEngineException &err) {
         FAIL();
     }
-}
\ No newline at end of file
+}
index 8afc781..010351c 100644 (file)
@@ -95,8 +95,7 @@ static const auto model = R"_(
 )_";
 
 TEST(NetworkSerializerTest, TopoSortResultUnique) {
-
-    auto reader = std::shared_ptr<InferenceEngine::ICNNNetReader>(InferenceEngine::CreateCNNNetReader());
+    auto reader = InferenceEngine::CreateCNNNetReaderPtr();
 
     InferenceEngine::ResponseDesc resp;
 
diff --git a/inference-engine/tests_deprecated/unit/inference_engine_tests/pointer_test.cpp b/inference-engine/tests_deprecated/unit/inference_engine_tests/pointer_test.cpp
deleted file mode 100644 (file)
index 8af6f72..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (C) 2018-2020 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#include <gtest/gtest.h>
-#include "cpp/ie_cnn_net_reader.h"
-
-using namespace InferenceEngine;
-
-class PointerTests : public ::testing::Test {};
-
-TEST_F(PointerTests, InferenceEnginePtrStoresValues) {
-    std::shared_ptr <ICNNNetReader> p(InferenceEngine::CreateCNNNetReader());
-    ASSERT_NE(p.get(), nullptr);
-}
index da771e7..f610e9f 100644 (file)
@@ -151,7 +151,8 @@ TEST_F(V2TopologyVerificationTests, testCheckConvolutionInputDim_More) {
     string testContent = getNetworkWithConvLayer("Q78", { 1, 1, 3, 227, 227 });
 
     xmlHelper->loadContent(testContent);
-    EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    EXPECT_THROW(xmlHelper->parse(), std::exception);
 }
 
 //convolution input must be 4D
@@ -159,27 +160,31 @@ TEST_F(V2TopologyVerificationTests, testCheckConvolutionInputDim_Less) {
     string testContent = getNetworkWithConvLayer("Q78", { 227, 227 });
 
     xmlHelper->loadContent(testContent);
-    EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    EXPECT_THROW(xmlHelper->parse(), std::exception);
 }
 
 //pooling input must be 4D
 TEST_F(V2TopologyVerificationTests, testCheckPoolingInputDim_Less) {
     string testContent = getNetworkWithPoolLayer({ 227, 227 });
     xmlHelper->loadContent(testContent);
-    EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    EXPECT_THROW(xmlHelper->parse(), std::exception);
 }
 
 //pooling input must be 4D
 TEST_F(V2TopologyVerificationTests, testCheckPoolingInputDim_More) {
     string testContent = getNetworkWithPoolLayer({ 1, 1, 3, 227, 227 });
     xmlHelper->loadContent(testContent);
-    EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    EXPECT_THROW(xmlHelper->parse(), std::exception);
 }
 
 TEST_F(V2TopologyVerificationTests, testLeayerPrecisionIsNotMIXED) {
     string testContent = getNetworkWithConvLayer("MIXED");
     xmlHelper->loadContent(testContent);
-    EXPECT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    EXPECT_THROW(xmlHelper->parse(), std::exception);
 }
 
 TEST_F(V2TopologyVerificationTests, testMixedPrecisionIfLayerAndNetworkPrecisionsDiffer) {
@@ -198,7 +203,9 @@ TEST_F(V2TopologyVerificationTests, throwsIfCropDimIsTooBig) {
 
     string testContent = getNetworkWithCropLayer({ data });
     xmlHelper->loadContent(testContent);
-    ASSERT_THROW(xmlHelper->parse(), InferenceEngine::details::InferenceEngineException);
+    
+    // TODO: fix RTTI issue and replace by InferenceEngine::details::InferenceEngineException
+    ASSERT_THROW(xmlHelper->parse(), std::exception);
 }
 
 TEST_F(V2TopologyVerificationTests, testNoThrowWithProperCropParameters) {
index db17b8e..af2c407 100644 (file)
@@ -420,6 +420,8 @@ struct layout {
         } else if (this->format == cldnn::format::os_is_yx_osv32_isv32p) {
             sizes[0] = align_to(sizes[0], 32);
             sizes[1] = align_to(sizes[1], 32);
+        } else if (this->format == cldnn::format::image_2d_rgba) {
+            sizes[1] = 4;
         }
         size_t total = std::accumulate(
             sizes.begin(),
index d16d210..30650ee 100644 (file)
@@ -121,12 +121,14 @@ struct format {
         b_fs_yx_32fp,                           ///< format for data for binary convolutions
         winograd_2x3_s1_data,                   ///< format used for input for winograd convolution, F(2,3) -- filter 3x3 with stride 1
         nv12,                                   ///< format for media nv12 input
+        image_2d_rgba,                          ///< format for image2d RGBA, always allocates memory for 4 feature maps (even when only 3 are used)
 
         // Weights formats
         oiyx,                                         ///< the most common format for 2D weights
         yxio,                                         ///< format used 2D weights
         oizyx,                                        ///< the most common format for 3D convolution
         os_iyx_osv16,                                 ///< format used only for convolution weights:
+        os_is_yx_osv16_isv16,                               ///< format used for convolution i8 weights
         os_zyxi_osv16,                                ///< format used for weights for 3D convolution
         os_is_yx_isv16_osv16,                         ///< format used for blocked convolution
         os_is_zyx_isv16_osv16,                        ///< format used for weights for blocked 3D convolution
@@ -183,6 +185,7 @@ struct format {
         gs_oiyx_gsv16,                                ///< format used for weights for 2D convolution
         gs_oiyx_gsv32,                                ///< format used for weights for 2D convolution
         g_is_os_zyx_osv16_isv16,                      ///< format used for grouped weights for blocked 3D deconvolution
+        g_os_is_yx_osv16_isv4,
         g_is_os_yx_osv16_isv16,
         g_os_is_zyx_isv8_osv16_isv2,
         g_os_is_yx_isv8_osv16_isv2,
@@ -228,6 +231,7 @@ struct format {
                 { bs_fs_zyx_bsv16_fsv16, { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",  {{0, 16 }, {1, 16}}}},
                 { bs_fs_yx_bsv16_fsv16,  { 1, 1, 3, 0, 0, "bfyx",   "bfxy?",  {{0, 16 }, {1, 16}}}},
                 { nv12,                  { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {}}},
+                { image_2d_rgba,         { 1, 1, 2, 0, 0, "bfyx",   "bfxy?",  {}}},
 
                 { oiyx,                                        { 1, 1, 2, 0, 0, "bfyx",   "bfxy",       {}}},
                 { yxio,                                        { 1, 1, 2, 0, 0, "yxfb",   "bfxy?",      {}}},
@@ -262,6 +266,7 @@ struct format {
                 { os_is_zyx_isv8_osv16_isv2,                   { 1, 1, 3, 0, 0, "bfzyx",  "bfxyz",      {{1, 8}, {0, 16}, {1, 2}}}},
                 { os_zyxi_osv16,                               { 1, 1, 3, 0, 0, "bzyxf",  "bfxyz",      {{0, 16}}}},
                 { os_is_yx_isv8_osv16_isv2,                    { 1, 1, 2, 0, 0, "bfzyx",  "bfxyz",      {{1, 8}, {0, 16}, {1, 2}}}},
+                { os_is_yx_osv16_isv16,                        { 1, 1, 2, 0, 0, "bfyx",   "bfxy",       {{1, 16}, {0, 16}}}},
 
                 { goiyx,                                       { 1, 1, 2, 0, 1, "gbfyx",  "bfxy????g",  {}}},
                 { goizyx,                                      { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {}}},
@@ -274,7 +279,8 @@ struct format {
                 { g_is_os_yx_osv16_isv16,                      { 1, 1, 2, 0, 1, "gfbyx",  "bfxy????g",  {{0, 16}, {1, 16}}}},
                 { g_os_is_zyx_isv8_osv16_isv2,                 { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {{1, 8}, {0, 16}, {1, 2}}}},
                 { g_os_is_yx_isv8_osv16_isv2,                  { 1, 1, 2, 0, 1, "gbfyx",  "bfxy????g",  {{1, 8}, {0, 16}, {1, 2}}}},
-                { g_os_is_zyx_isv16_osv16,                     { 1, 1, 3, 0, 1, "bfzyx",  "bfxyz???g",  {{0, 16}, {1, 16}}}},
+                { g_os_is_zyx_isv16_osv16,                     { 1, 1, 3, 0, 1, "gbfzyx", "bfxyz???g",  {{0, 16}, {1, 16}}}},
+                { g_os_is_yx_osv16_isv4,                       { 1, 1, 2, 0, 1, "gbfxy",  "bfxy????g",  {{0, 16}, {1, 4}}}},
         };
         return traits.at(fmt);
     }
@@ -311,7 +317,8 @@ struct format {
                 fmt == image_2d_weights_c1_b_fyx ||
                 fmt == image_2d_weights_winograd_6x3_s1_fbxyb ||
                 fmt == image_2d_weights_winograd_6x3_s1_xfbyb ||
-                fmt == nv12);
+                fmt == nv12 ||
+                fmt == image_2d_rgba);
     }
     /// @brief Checks if @p format is of grouped type
     static bool is_grouped(type fmt) { return group_num(fmt) != 0; }
index a2f9d20..f7528c4 100644 (file)
@@ -177,6 +177,7 @@ struct fused_conv_eltwise : public primitive_base<fused_conv_eltwise> {
 
     /// @brief Is optimization that output contains data from second input ON ?
     bool second_input_in_output = false;
+    bool depth_to_space_already_fused = false;
 
 protected:
     const primitive_id_arr conv_weights;
index 8f668b1..c67c817 100644 (file)
@@ -520,7 +520,7 @@ typedef CL_API_ENTRY cl_mem(CL_API_CALL * PFN_clCreateFromMediaSurfaceINTEL)(
                 return detail::errHandler(CL_INVALID_ARG_VALUE, fname);
             }
 
-            static PFN_clGetDeviceIDsFromMediaAdapterINTEL pfn_clGetDeviceIDsFromMediaAdapterINTEL = NULL;
+            PFN_clGetDeviceIDsFromMediaAdapterINTEL pfn_clGetDeviceIDsFromMediaAdapterINTEL = NULL;
             if (!pfn_clGetDeviceIDsFromMediaAdapterINTEL) {
                 pfn_clGetDeviceIDsFromMediaAdapterINTEL =
                     reinterpret_cast<PFN_clGetDeviceIDsFromMediaAdapterINTEL>
@@ -540,44 +540,46 @@ typedef CL_API_ENTRY cl_mem(CL_API_CALL * PFN_clCreateFromMediaSurfaceINTEL)(
                 0,
                 NULL,
                 &n);
-            if (err != CL_SUCCESS) {
+            if (err != CL_SUCCESS && err != CL_DEVICE_NOT_FOUND) {
                 return detail::errHandler(err, fname);
             }
 
-            vector<cl_device_id> ids(n);
-            err = pfn_clGetDeviceIDsFromMediaAdapterINTEL(
-                object_,
-                media_adapter_type,
-                media_adapter,
-                media_adapter_set,
-                n,
-                ids.data(),
-                NULL);
-            if (err != CL_SUCCESS) {
-                return detail::errHandler(err, fname);
-            }
+            if (err != CL_DEVICE_NOT_FOUND)
+            {
+                vector<cl_device_id> ids(n);
+                err = pfn_clGetDeviceIDsFromMediaAdapterINTEL(
+                    object_,
+                    media_adapter_type,
+                    media_adapter,
+                    media_adapter_set,
+                    n,
+                    ids.data(),
+                    NULL);
+                if (err != CL_SUCCESS) {
+                    return detail::errHandler(err, fname);
+                }
 
-            // Cannot trivially assign because we need to capture intermediates
-            // with safe construction
-            // We must retain things we obtain from the API to avoid releasing
-            // API-owned objects.
-            if (devices) {
-                devices->resize(ids.size());
-
-                // Assign to param, constructing with retain behaviour
-                // to correctly capture each underlying CL object
-                for (size_type i = 0; i < ids.size(); i++) {
-                    (*devices)[i] = Device(ids[i], true);
+                // Cannot trivially assign because we need to capture intermediates
+                // with safe construction
+                // We must retain things we obtain from the API to avoid releasing
+                // API-owned objects.
+                if (devices) {
+                    devices->resize(ids.size());
+
+                    // Assign to param, constructing with retain behaviour
+                    // to correctly capture each underlying CL object
+                    for (size_type i = 0; i < ids.size(); i++) {
+                        (*devices)[i] = Device(ids[i], true);
+                    }
                 }
-            }
 
-            // set up acquire/release extensions
-            SharedSurfLock::Init(object_);
-            ImageVA::Init(object_);
+                // set up acquire/release extensions
+                SharedSurfLock::Init(object_);
+                ImageVA::Init(object_);
 #ifdef WIN32
-            BufferDX::Init(object_);
+                BufferDX::Init(object_);
 #endif
-
+            }
             return CL_SUCCESS;
         }
     };
index 9cd2c4f..a8028ca 100644 (file)
@@ -53,6 +53,7 @@ DataTensor::DataChannelArray DataTensor::dataChannelArray {{
     { DataLayout::b_fs_yx_32fp,          {  0,  1, -1, -1,  2,  3 } },
     { DataLayout::bfwzyx,                {  0,  1,  2,  3,  4,  5 } },
     { DataLayout::nv12,                  {  0,  1, -1, -1,  2,  3 } },
+    { DataLayout::image_2d_rgba,         {  0,  1, -1, -1,  2,  3 } },
 }};
 
 WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
@@ -71,6 +72,7 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::os_i_osv8__ai8,                              { -1, -1, -1,   0,   1, -1, -1, -1 } },
     { WeightsLayout::os_i_osv16__ai8,                             { -1, -1, -1,   0,   1, -1, -1, -1 } },
     { WeightsLayout::os_i_osv16,                                  { -1, -1, -1,   0,   1, -1, -1, -1 } },
+    { WeightsLayout::os_is_yx_osv16_isv16,                        {  0,  1, -1,   2,   3, -1, -1, -1 } },
     { WeightsLayout::i_yxs_os_yxsv2_osv16,                        {  1,  2, -1,   3,   0, -1, -1, -1 } },
     { WeightsLayout::iy_xs_os_xsv2_osv16__ao32,                   {  1,  2, -1,   3,   0, -1, -1, -1 } },
     { WeightsLayout::iy_xs_os_xsv2_osv8__ao32,                    {  1,  2, -1,   3,   0, -1, -1, -1 } },
@@ -104,6 +106,7 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::os_is_yx_isv8_osv16_isv2,                    {  0,  1, -1,   2,   3, -1, -1, -1 } },
     { WeightsLayout::os_zyxi_osv16,                               {  1,  2,  3,   0,   4, -1, -1, -1 } },
     { WeightsLayout::os_i_yxs_osv4_yxsv4,                         {  0,  1, -1,   2,   3, -1, -1, -1 } },
+    { WeightsLayout::is_os_yx_osv16_isv16,                        {  0,  1, -1,   3,   2, -1, -1, -1 } },
     { WeightsLayout::goiyx,                                       {  0,  1, -1,   2,   3, -1, -1,  4 } },
     { WeightsLayout::goizyx,                                      {  0,  1,  2,   3,   4, -1, -1,  5 } },
     { WeightsLayout::g_os_iyx_osv16,                              {  0,  1, -1,   2,   3, -1, -1,  4 } },
@@ -120,8 +123,11 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{
     { WeightsLayout::g_os_is_zyx_isv16_osv16,                     {  0,  1,  2,   3,   4, -1, -1,  5 } },
     { WeightsLayout::giy_xs_os_xsv2_osv16__ao32,                  {  1,  2, -1,   3,   0, -1, -1,  4 } },
     { WeightsLayout::giy_xs_os_xsv2_osv8__ao32,                   {  1,  2, -1,   3,   0, -1, -1,  4 } },
-    { WeightsLayout::gs_oi_yxs_gsv4_yxsv4,                        {  0,  1, -1,   2,   3, -1, -1,  4 } },
     { WeightsLayout::g_os_is_yx_isv16_osv16,                      {  0,  1, -1,   2,   3, -1, -1,  4 } },
+    { WeightsLayout::gs_oi_yxs_gsv4_yxsv4,                        {  0,  1, -1,   2,   3, -1, -1,  4 } },
+    { WeightsLayout::gs_oi_yxs_gsv16_yxsv4,                       {  0,  1, -1,   2,   3, -1, -1,  4 } },
+    { WeightsLayout::gs_oi_yxs_gsv32_yxsv4,                       {  0,  1, -1,   2,   3, -1, -1,  4 } },
+    { WeightsLayout::g_os_is_yx_osv16_isv4,                       {  0,  1, -1,   2,   3, -1, -1,  4 } },
 }};
 
 NDims DataTensor::GetSimpleDims(const std::vector<size_t>& d, DataLayout l) {
@@ -614,6 +620,22 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
         case gs_oi_yxs_gsv4_yxsv4:
             newDims[4] = RoundUp(newDims[4], 4);
             break;
+        case os_is_yx_osv16_isv16:
+            assert(newDims.size() == 4);
+            newDims[2] = RoundUp(newDims[2], 16);
+            newDims[3] = RoundUp(newDims[3], 16);
+            break;
+        case gs_oi_yxs_gsv16_yxsv4:
+            newDims[4] = RoundUp(newDims[4], 16);
+            break;
+        case gs_oi_yxs_gsv32_yxsv4:
+            newDims[4] = RoundUp(newDims[4], 32);
+            break;
+        case g_os_is_yx_osv16_isv4:
+            assert(newDims.size() == 5);
+            newDims[2] = RoundUp(newDims[2], 4);
+            newDims[3] = RoundUp(newDims[3], 16);
+            break;
         default:
             break;
     }
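
To make the new rounding rules concrete, here is a minimal standalone sketch (illustrative channel counts only, not part of the patch) of how os_is_yx_osv16_isv16 pads both the IFM and OFM dimensions of a weights tensor up to multiples of 16, mirroring the RoundUp calls above:

// Hypothetical helper equivalent to RoundUp() used in the hunk above.
#include <cstddef>
#include <iostream>

static size_t round_up(size_t v, size_t multiple) {
    return (v + multiple - 1) / multiple * multiple;
}

int main() {
    // A 3x3 convolution with 24 input and 40 output channels is laid out
    // as if it had 32 input and 48 output channels (both rounded to 16).
    std::cout << round_up(24, 16) << " x " << round_up(40, 16) << std::endl;  // 32 x 48
    return 0;
}
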
@@ -658,8 +680,13 @@ NDims WeightsTensor::GetSimpleDims(const std::vector<size_t>& d, WeightsLayout l
     } else if (l == gs_oi_yxs_gsv4_yxsv4) {
         ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 4;
         ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
+    } else if (l == gs_oi_yxs_gsv16_yxsv4) {
+        ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 16;
+        ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
+    } else if (l == gs_oi_yxs_gsv32_yxsv4) {
+        ret[2].pitch = RoundUp(ret[0].v * ret[1].v, 4) * 32;
+        ret[4].pitch = ret[3].v * RoundUp(ret[0].v * ret[1].v, 4);
     }
-
     return ret;
 }
 
index 7f05b67..05979ae 100644 (file)
@@ -61,6 +61,7 @@ enum DataLayout {
     b_fs_yx_32fp,           // bfyx with blocks of 16 packed binary input channels
     bfwzyx,                 // batch, feature, 4D spatial
     nv12,                   // media nv12 layout
+    image_2d_rgba,          // image2d RGBA
     DataLayoutCount         // NUMBER OF ELEMENTS IN ENUM
 };
 
@@ -89,6 +90,7 @@ enum WeightsLayout {
     os_i_osv8__ai8,  // TODO can we drop the alignment form layout name?
     os_i_osv16__ai8,
     os_i_osv16,
+    os_is_yx_osv16_isv16,           // weights for int8 blocked conv
     i_yxs_os_yxsv2_osv16,
     iy_xs_os_xsv2_osv16__ao32,
     iy_xs_os_xsv2_osv8__ao32,
@@ -138,8 +140,12 @@ enum WeightsLayout {
     g_os_is_zyx_isv16_osv16,
     giy_xs_os_xsv2_osv16__ao32,
     giy_xs_os_xsv2_osv8__ao32,
-    gs_oi_yxs_gsv4_yxsv4,                // grouped weights for depthwise IMAD convolution
     g_os_is_yx_isv16_osv16,
+    gs_oi_yxs_gsv4_yxsv4,                // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv4 format)
+    gs_oi_yxs_gsv16_yxsv4,               // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv16 format)
+    gs_oi_yxs_gsv32_yxsv4,               // grouped weights for depthwise IMAD convolution (b_fs_yx_fsv32 format)
+
+    g_os_is_yx_osv16_isv4,
     WeightsLayoutCount                   // NUMBER OF ELEMENTS IN ENUM
 };
 
@@ -225,6 +231,7 @@ inline bool GroupedLayout(WeightsLayout l) {
         case WeightsLayout::giy_xs_os_xsv2_osv16__ao32:
         case WeightsLayout::giy_xs_os_xsv2_osv8__ao32:
         case WeightsLayout::gs_oi_yxs_gsv4_yxsv4:
+        case WeightsLayout::g_os_is_yx_osv16_isv4:
             return true;
         default:
             return false;
index 6394086..f10d56c 100644 (file)
@@ -59,6 +59,7 @@ ParamsKey ConcatenationKernel_simple_Ref::GetSupportedKey() const {
     k.EnableConcatAxis(ConcatAxis::FEATURE);
     k.EnableConcatAxis(ConcatAxis::BATCH);
     k.EnableConcatKernelPerInput();
+    k.EnableDifferentTypes();
     return k;
 }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.cpp
new file mode 100644 (file)
index 0000000..1362af8
--- /dev/null
@@ -0,0 +1,241 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+namespace kernel_selector {
+
+namespace {
+
+size_t getOutBlock_X(size_t output_size_x, size_t stride_x) {
+    size_t output_block_width = 0;
+    size_t max_block_size = std::min((SIMD_SIZE - 1) / stride_x + 1, output_size_x);
+
+    if (output_size_x <= max_block_size)
+        return output_size_x;
+
+    for (size_t block = 4; block <= max_block_size; ++block) {
+        if (output_size_x % block == 0)
+            output_block_width = block;
+    }
+    if (output_block_width == 0 && output_size_x < max_block_size * 3) {
+        size_t min_overhang = max_block_size;
+        for (size_t block = 4; block <= max_block_size; ++block) {
+            size_t overhang = block - output_size_x % block;
+            if (overhang <= min_overhang) {
+                min_overhang = overhang;
+                output_block_width = block;
+            }
+        }
+    }
+
+    if (output_block_width == 0) {
+        output_block_width = max_block_size;
+    }
+    return output_block_width;
+}
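
For intuition, here is a standalone sketch of the block-width selection implemented above (hypothetical helper name pick_block_width and sample sizes; the real kernel calls getOutBlock_X with SIMD_SIZE = 16). The largest block in [4, max] that divides the output width exactly wins; failing that, the block with the smallest overhang; failing that, the maximum block size:

// Illustrative re-implementation of the heuristic above, not part of the patch.
#include <algorithm>
#include <cstddef>
#include <iostream>

static size_t pick_block_width(size_t output_size_x, size_t stride_x, size_t simd = 16) {
    size_t max_block_size = std::min((simd - 1) / stride_x + 1, output_size_x);
    if (output_size_x <= max_block_size)
        return output_size_x;                    // whole row fits in one block
    size_t width = 0;
    for (size_t block = 4; block <= max_block_size; ++block)
        if (output_size_x % block == 0)
            width = block;                       // largest exact divisor wins
    if (width == 0 && output_size_x < max_block_size * 3) {
        size_t min_overhang = max_block_size;
        for (size_t block = 4; block <= max_block_size; ++block) {
            size_t overhang = block - output_size_x % block;
            if (overhang <= min_overhang) {      // smallest overhang wins (ties to larger block)
                min_overhang = overhang;
                width = block;
            }
        }
    }
    return width == 0 ? max_block_size : width;
}

int main() {
    std::cout << pick_block_width(56, 1) << "\n";  // 14: largest divisor of 56 in [4, 16]
    std::cout << pick_block_width(20, 2) << "\n";  // 5:  largest divisor of 20 in [4, 8]
    std::cout << pick_block_width(13, 1) << "\n";  // 13: already fits in one block
    return 0;
}
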
+
+bool should_k_slice(const convolution_params& params, size_t output_block_width) {
+    constexpr float preferred_eu_occupancy = 5.f;
+    if (params.inputs[0].Feature().v % (16 * 4) != 0)
+        return false;
+
+    size_t eu_count = params.engineInfo.computeUnitsCount;
+    auto global_size = CeilDiv(params.output.X().v, output_block_width) *
+        params.output.Y().v *
+        params.output.Batch().v * Align(CeilDiv(params.output.Feature().v, 2), SIMD_SIZE);
+    auto threads = global_size / SIMD_SIZE;
+    auto optimal_threads_num = eu_count * preferred_eu_occupancy;
+    return threads < optimal_threads_num;
+}
+
+}  // namespace
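
For intuition only (assumed device and shapes, not taken from the patch): the k-slicing heuristic above targets roughly five SIMD-16 threads per EU. On a hypothetical 24-EU device that is 24 * 5 = 120 threads; a 14x14 output with 64 features, batch 1 and out_block_width = 14 yields global_size = 1 * 14 * 1 * Align(32, 16) = 448, i.e. 448 / 16 = 28 threads, so k-slicing is enabled (28 < 120) provided the input feature count is a multiple of 16 * 4 = 64.
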
+
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Convolution_kernel_b_fs_yx_fsv16_imad_1x1()
+    : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_1x1") {
+    for (size_t bw = 1; bw <= SIMD_SIZE; ++bw) {
+        for (auto exe : ConvolutionKernelBase::autoTuneOptions) {
+            all_tune_params.push_back(AutoTuneParams{ bw, true, exe });
+            all_tune_params.push_back(AutoTuneParams{ bw, false, exe });
+        }
+    }
+}
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+
+    k.EnableInputWeightsType(WeightsType::INT8);
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+    k.EnableDifferentTypes();
+    k.EnableDifferentInputWeightsTypes();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    k.EnableQuantization(QuantizationType::SYMMETRIC);
+    return k;
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetJitConstants(const convolution_params& params,
+                                                                        const DispatchData& kd) const {
+    auto mem_consts = Parent::GetJitConstants(params, kd);
+    mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", kd.cldnnStyle.blockWidth));
+    mem_consts.AddConstant(MakeJitConstant("FEATURE_LWS_SPLIT", kd.cldnnStyle.prefetch));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf_scalar = {"", {"out_b", "out_f + out_f_offset", "out_y", "out_x + i"}, "dequantized", input_dt, 1 };
+        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+    }
+
+    return mem_consts;
+}  // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::SetDefault(const convolution_params& params,
+                                                                                          int index) const {
+    DispatchData kd;
+    const auto& output = params.output;
+    auto tune_params = GetAutoTuneParams(params, index);
+    size_t k_slices = tune_params.k_slicing ? 4 : 1;
+
+    kd.gws0 = CeilDiv(output.X().v, tune_params.out_block_width);
+    kd.gws1 = output.Y().v;
+    kd.gws2 = output.Batch().v * Align(CeilDiv(output.Feature().v, 2), SIMD_SIZE) * k_slices;
+
+    kd.lws0 = 1;
+    kd.lws1 = 1;
+    kd.lws2 = SIMD_SIZE * k_slices;
+
+    kd.cldnnStyle = {0, 0, 0, 0, 0};
+    kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+    kd.cldnnStyle.blockWidth = tune_params.out_block_width;
+    kd.cldnnStyle.prefetch = k_slices;
+
+    kd.efficiency = FORCE_PRIORITY_2;
+
+    return kd;
+}  // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::Validate(const Params& params, const optional_params& options) const {
+    if (!Parent::Validate(params, options)) {
+        return false;
+    }
+
+    KernelData kd = KernelData::Default<convolution_params>(params);
+    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+    if ((newParams.filterSize.x != newParams.filterSize.y) ||
+        newParams.filterSize.x != 1) {
+        // Filter size needs to be 1x1
+        return false;
+    }
+
+    if ((newParams.stride.x != newParams.stride.y) ||
+        (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+        // Strides must be 1x1 or 2x2
+        return false;
+    }
+
+    if (newParams.groups != 1 || newParams.split != 1)
+        return false;
+
+    return true;
+}
+
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::AutoTuneParams
+Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetAutoTuneParams(const convolution_params& params, int index) const {
+    if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
+        return all_tune_params[index];
+    }
+    AutoTuneParams default_params;
+    default_params.out_block_width = getOutBlock_X(params.output.X().v, params.stride.x);
+    default_params.k_slicing = should_k_slice(params, default_params.out_block_width);
+    default_params.exe_mode = DEFAULT;
+    return default_params;
+}
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_1x1::ValidateAutoTuneParams(const convolution_params& params,
+                                                                       const AutoTuneParams& tune_params) const {
+    if (tune_params.k_slicing && params.inputs[0].Feature().v % (16 * 4) != 0)
+        return false;
+
+    size_t max_block_size = std::min(static_cast<size_t>((SIMD_SIZE - 1) / params.stride.x + 1), params.output.X().v);
+    if (tune_params.out_block_width > max_block_size)
+        return false;
+
+    return true;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetKernelsData(const Params& params,
+                                                                      const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetTunedKernelsDataByIndex(const Params & params,
+                                                                                  const optional_params & options,
+                                                                                  int autoTuneIndex) const {
+    auto conv_params = static_cast<const convolution_params&>(params);
+    auto tune_params = GetAutoTuneParams(conv_params, autoTuneIndex);
+    if (!ValidateAutoTuneParams(conv_params, tune_params))
+        return {};
+    return GetCommonKernelsData(params, options, tune_params.exe_mode, autoTuneIndex);
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetKernelsDataForAutoTune(const Params & params,
+                                                                                 const optional_params & options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+    auto& conv_params = static_cast<const convolution_params&>(params);
+
+    KernelsData res = {};
+
+    for (size_t i = 0; i < all_tune_params.size(); i++) {
+        auto tune_params = GetAutoTuneParams(conv_params, static_cast<int>(i));
+        if (!ValidateAutoTuneParams(conv_params, tune_params))
+            continue;
+        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
+        if (!kd.empty()) {
+            res.emplace_back(kd[0]);
+        }
+    }
+
+    return res;
+}
+
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_1x1.h
new file mode 100644 (file)
index 0000000..7133d2d
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+#include <string>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_1x1 : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    Convolution_kernel_b_fs_yx_fsv16_imad_1x1();
+    virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_1x1() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsData GetKernelsDataForAutoTune(const Params & params, const optional_params & options) const override;
+    KernelsData GetTunedKernelsDataByIndex(const Params & params, const optional_params & options, int autoTuneIndex = -1) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+    bool NeedPaddedInput() const override { return true; }
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+        return WeightsLayout::os_is_yx_osv16_isv16;
+    }
+
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ACTIVATION };
+    }
+
+    struct AutoTuneParams {
+        size_t out_block_width;
+        bool k_slicing;
+        std::string exe_mode;
+    };
+    std::vector<AutoTuneParams> all_tune_params;
+
+    bool ValidateAutoTuneParams(const convolution_params& params, const AutoTuneParams& tune_params) const;
+    AutoTuneParams GetAutoTuneParams(const convolution_params& params, int index) const;
+};
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.cpp
new file mode 100644 (file)
index 0000000..a1aef4f
--- /dev/null
@@ -0,0 +1,165 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+static size_t getOutBlock_X(const size_t output_size_x, const size_t stride_x, const size_t filter_size_x) {
+    size_t output_block_width = 0;
+    size_t max_block_size = std::min((SIMD_SIZE - filter_size_x) / stride_x + 1, output_size_x);
+
+    if (output_size_x <= max_block_size)
+        return output_size_x;
+
+    for (size_t block = 4; block <= max_block_size; ++block) {
+        if (output_size_x % block == 0)
+            output_block_width = block;
+    }
+    if (output_block_width == 0 && output_size_x < max_block_size * 3) {
+        size_t min_overhang = max_block_size;
+        for (size_t block = 4; block <= max_block_size; ++block) {
+            size_t overhang = block - output_size_x % block;
+            if (overhang <= min_overhang) {
+                min_overhang = overhang;
+                output_block_width = block;
+            }
+        }
+    }
+
+    if (output_block_width == 0) {
+        output_block_width = max_block_size;
+    }
+    return output_block_width;
+}
+
+static size_t get_ofm_per_wi(const size_t output_size_f) {
+    if (output_size_f % 32 == 0)
+        return 2;
+    return 1;
+}
+
+namespace kernel_selector {
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+
+    k.EnableInputWeightsType(WeightsType::INT8);
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+    k.EnableDifferentTypes();
+    k.EnableDifferentInputWeightsTypes();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    k.EnableQuantization(QuantizationType::SYMMETRIC);
+    k.DisableTuning();
+    return k;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetKernelsData(const Params& params,
+                                                                   const optional_params& options) const {
+    return GetCommonKernelsData(params, options);
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_3x3::GetJitConstants(const convolution_params& params,
+                                                                     const DispatchData& kd) const {
+    auto mem_consts = Parent::GetJitConstants(params, kd);
+    const auto& output = params.output;
+
+    mem_consts.AddConstant(MakeJitConstant("OUT_BLOCK_WIDTH", getOutBlock_X(output.X().v, params.stride.x, params.filterSize.x)));
+    mem_consts.AddConstant(MakeJitConstant("OFM_BLOCKS_PER_SIMD", get_ofm_per_wi(output.Feature().v)));
+    mem_consts.AddConstant(MakeJitConstant("OFM_SIZE_PER_SIMD", SIMD_SIZE * get_ofm_per_wi(output.Feature().v)));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf_scalar = {"", {"out_b", "out_f + j * 16", "out_y", "out_x + i"}, "dequantized", input_dt, 1};
+        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+    }
+
+    return mem_consts;
+}  // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_3x3::SetDefault(const convolution_params& params,
+                                                                           int) const {
+    DispatchData kd;
+    const auto& output = params.output;
+    auto output_block_width = getOutBlock_X(output.X().v, params.stride.x, params.filterSize.x);
+    auto ofm_blocks_per_simd = get_ofm_per_wi(output.Feature().v);
+
+    kd.gws0 = CeilDiv(output.X().v, output_block_width);
+    kd.gws1 = output.Y().v;
+    kd.gws2 = output.Batch().v * Align(output.Feature().v / ofm_blocks_per_simd, SIMD_SIZE);
+
+    kd.lws0 = 1;
+    kd.lws1 = 1;
+    kd.lws2 = SIMD_SIZE;
+
+    kd.cldnnStyle = {0, 0, 0, 0, 0};
+    kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+    if (params.filterSize.x == 3)
+        kd.efficiency = FORCE_PRIORITY_2;
+    else
+        kd.efficiency = FORCE_PRIORITY_5;
+
+    return kd;
+}  // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_3x3::Validate(const Params& params, const optional_params& options) const {
+    if (!Parent::Validate(params, options)) {
+        return false;
+    }
+
+    KernelData kd = KernelData::Default<convolution_params>(params);
+    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+    if ((newParams.filterSize.x != newParams.filterSize.y) ||
+        (newParams.filterSize.x != 3 && newParams.filterSize.x != 5)) {
+        // Filter size needs to be 3x3 or 5x5
+        return false;
+    }
+
+    if ((newParams.stride.x != newParams.stride.y) ||
+        (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+        // Strides must be 1x1 or 2x2
+        return false;
+    }
+
+    if (newParams.groups != 1 || newParams.split != 1)
+        return false;
+
+    return true;
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3.h
new file mode 100644 (file)
index 0000000..e69a798
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_3x3 : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    Convolution_kernel_b_fs_yx_fsv16_imad_3x3() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_3x3") {}
+    virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_3x3() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+    bool NeedPaddedInput() const override { return true; }
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+        return WeightsLayout::os_is_yx_osv16_isv16;
+    }
+
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ACTIVATION };
+    }
+};
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.cpp
new file mode 100644 (file)
index 0000000..e62348c
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h"
+#include "kernel_selector_utils.h"
+#include "common_tools.h"
+#include <vector>
+#include <iostream>
+
+//
+// Kernel specific constants
+//
+#define SIMD_SIZE 16
+
+static size_t getOutBlock_X(size_t output_size_x) {
+    auto output_block_width = 7;
+    if (output_size_x % 8 == 0)
+        output_block_width = 8;
+    return output_block_width;
+}
+
+
+namespace kernel_selector {
+
+ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+
+    k.EnableInputWeightsType(WeightsType::INT8);
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
+    k.EnableDifferentTypes();
+    k.EnableDifferentInputWeightsTypes();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    k.EnableQuantization(QuantizationType::SYMMETRIC);
+    k.DisableTuning();
+    return k;
+}
+
+KernelsData Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetKernelsData(const Params& params,
+                                                                      const optional_params& options) const {
+    return GetCommonKernelsData(params, options);
+}
+
+JitConstants Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::GetJitConstants(const convolution_params& params,
+                                                                        const DispatchData& kd) const {
+    auto mem_consts = Parent::GetJitConstants(params, kd);
+    const auto& output = params.output;
+
+    mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", getOutBlock_X(output.X().v))});
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf_scalar = {"",
+                                             {"out_b", "(out_f + get_sub_group_id() * 16)", "out_y", "out_x + i"},
+                                             "dequantized",
+                                             input_dt,
+                                             1};
+        conf_scalar.SetLoopAxes({ Tensor::DataChannelName::X }, true);
+        mem_consts.Merge(MakeFusedOpsJitConstants(params, {conf_scalar}));
+    }
+
+    return mem_consts;
+}  // GetJitConstants
+
+ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::SetDefault(
+    const convolution_params& params,
+    int) const {
+    DispatchData kd;
+    const auto& output = params.output;
+
+    auto output_block_width = getOutBlock_X(output.X().v);
+    kd.gws0 = output.X().v / output_block_width;
+    kd.gws1 = output.Y().v;
+    kd.gws2 = output.Batch().v * output.Feature().v * 2;
+
+    kd.lws0 = 1;
+    kd.lws1 = 1;
+    kd.lws2 = SIMD_SIZE * 4;
+
+    kd.cldnnStyle = {0, 0, 0, 0, 0};
+    kd.gemmStyle = {0, 0, 0, 0, 0, 0};
+
+    kd.efficiency = FORCE_PRIORITY_1;
+
+    return kd;
+}  // SetDefault
+
+bool Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks::Validate(const Params& params, const optional_params& options) const {
+    if (!Parent::Validate(params, options)) {
+        return false;
+    }
+
+    KernelData kd = KernelData::Default<convolution_params>(params);
+    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
+
+    if (newParams.output.Feature().v % (2 * SIMD_SIZE) != 0) {
+        return false;
+    }
+
+    if ((newParams.filterSize.x != newParams.filterSize.y) ||
+        newParams.filterSize.x != 3) {
+        // Filter size needs to be 3x3
+        return false;
+    }
+
+    if ((newParams.stride.x != newParams.stride.y) ||
+        (newParams.stride.x != 1 && newParams.stride.x != 2)) {
+        // Strides must be 1x1 or 2x2
+        return false;
+    }
+
+    if (newParams.output.X().v % 8 != 0 && newParams.output.X().v % 7 != 0) {
+        return false;
+    }
+
+    if (CeilDiv(newParams.inputs[0].Feature().v, 16) % 4 != 0) {
+        return false;
+    }
+
+    const auto& output = newParams.output;
+    auto output_block_width = getOutBlock_X(output.X().v);
+    size_t eu_count = params.engineInfo.computeUnitsCount;
+    auto global_size =
+        (output.X().v / output_block_width) * output.Y().v * ((output.Batch().v * output.Feature().v));
+    if ((global_size / 16) > (eu_count * 7)) {
+        return false;
+    }
+
+    if (newParams.groups != 1 || newParams.split != 1)
+        return false;
+
+    return true;
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h
new file mode 100644 (file)
index 0000000..d3dfeaf
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+
+namespace kernel_selector {
+
+class Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks() : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv16_imad_3x3_ks") {}
+    virtual ~Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+    bool NeedPaddedInput() const override { return true; }
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+        return WeightsLayout::os_is_yx_osv16_isv16;
+    }
+
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ACTIVATION };
+    }
+};
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.cpp
new file mode 100644 (file)
index 0000000..d15c965
--- /dev/null
@@ -0,0 +1,247 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
+
+#include <vector>
+#include <string>
+#include <algorithm>
+
+namespace kernel_selector {
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw()
+    : ConvolutionKernelBase("convolution_gpu_b_fs_yx_fsv_16_32_imad_dw") {
+    std::vector<size_t> simd_sizes = { 8, 16 };
+    std::vector<size_t> tile_x_sizes = { 1, 2, 3, 4, 5, 7, 8, 11, 16, 24, 32 };
+    std::vector<std::string> exe_modes = ConvolutionKernelBase::autoTuneOptions;
+
+    constexpr size_t max_block_size = 32 * 8;
+
+    for (auto simd : simd_sizes) {
+        for (size_t tile_x = 1; tile_x <= 32; ++tile_x) {
+            if (simd * tile_x > max_block_size)
+                continue;
+            for (auto exe_mode : exe_modes) {
+                all_tune_params.push_back(AutoTuneParams{ simd, tile_x, exe_mode });
+            }
+        }
+    }
+}
+
+ParamsKey kernel_selector::ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableInputWeightsType(WeightsType::INT8);
+    k.EnableInputWeightsType(WeightsType::UINT8);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv32);
+    k.EnableDifferentTypes();
+    k.EnableDifferentInputWeightsTypes();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    k.EnableQuantization(QuantizationType::SYMMETRIC);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
+    k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
+    k.EnableDepthwiseSeparableOpt();
+    k.EnableGroupedConvolution();
+    return k;
+}
+
+bool ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::Validate(const Params& params, const optional_params& options) const {
+    if (!Parent::Validate(params, options))
+        return false;
+
+    auto conv_params = static_cast<const convolution_params&>(params);
+
+    if (conv_params.inputs[0].GetLayout() != conv_params.output.GetLayout())
+        return false;
+
+    if (conv_params.groups != conv_params.output.Feature().v || conv_params.groups != conv_params.inputs[0].Feature().v)
+        return false;
+
+    // Additional checks for asymmetric data
+    if (conv_params.quantization == QuantizationType::ASYMMETRIC_DATA ||
+        conv_params.quantization == QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS) {
+        // Needs compensation optimization
+        if (conv_params.compensation.empty())
+            return false;
+        // Padding not supported
+        const auto inputLimitX = (conv_params.output.X().v - 1) * conv_params.stride.x
+                               + (conv_params.filterSize.x - 1) * conv_params.dilation.x + 1;
+        const auto inputLimitY = (conv_params.output.Y().v - 1) * conv_params.stride.y
+                               + (conv_params.filterSize.y - 1) * conv_params.dilation.y + 1;
+        const auto inputLimitZ = (conv_params.output.Z().v - 1) * conv_params.stride.z
+                               + (conv_params.filterSize.z - 1) * conv_params.dilation.z + 1;
+
+        bool needs_pad = false;
+        needs_pad |= conv_params.padding.x != 0;
+        needs_pad |= conv_params.padding.y != 0;
+        needs_pad |= conv_params.padding.z != 0;
+        needs_pad |= inputLimitX > conv_params.output.X().v;
+        needs_pad |= inputLimitY > conv_params.output.Y().v;
+        needs_pad |= inputLimitZ > conv_params.output.Z().v;
+
+        if (needs_pad)
+            return false;
+    }
+
+    return true;
+}
+
+WeightsLayout ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetPreferredWeightsLayout(const convolution_params& params) const {
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16)
+        return WeightsLayout::gs_oi_yxs_gsv16_yxsv4;
+    else
+        return WeightsLayout::gs_oi_yxs_gsv32_yxsv4;
+}
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::AutoTuneParams
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetAutoTuneParams(const convolution_params& params, int index) const {
+    if (index >= 0 && index < static_cast<int>(all_tune_params.size())) {
+        return all_tune_params[index];
+    }
+    AutoTuneParams tune_params;
+    tune_params.simd = 16;
+    if (params.output.GetLayout() == DataLayout::b_fs_yx_fsv16) {
+        tune_params.tile_x = std::min((size_t)16, params.output.X().v);
+    } else {
+        tune_params.tile_x = std::min((size_t)8, params.output.X().v);
+    }
+
+    if (params.output.X().v < 3 * tune_params.tile_x && params.output.X().v % tune_params.tile_x != 0) {
+        tune_params.tile_x = tune_params.tile_x / 2;
+    }
+
+    return tune_params;
+}
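
As a rough illustration (hypothetical output shapes): for a b_fs_yx_fsv16 output with X = 20, the default above starts from tile_x = min(16, 20) = 16, and because 20 < 3 * 16 while 20 % 16 != 0 it halves to tile_x = 8, giving CeilDiv(20, 8) = 3 work-groups along X; with X = 32 the tile stays at 16 since the width divides evenly.
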
+
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::DispatchData
+ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::SetDefault(const convolution_params& params, int autoTuneIndex) const {
+    DispatchData kd;
+    auto& out = params.output;
+
+    auto tune_params = GetAutoTuneParams(params, autoTuneIndex);
+
+    size_t fsv = 1;
+    if (out.GetLayout() == DataLayout::b_fs_yx_fsv16) {
+        fsv = 16;
+    } else if (out.GetLayout() == DataLayout::b_fs_yx_fsv32) {
+        fsv = 32;
+    }
+
+    std::vector<size_t> global = {
+        CeilDiv(out.X().v, tune_params.tile_x),
+        out.Y().v,
+        CeilDiv(out.Feature().v, fsv) * tune_params.simd * out.Batch().v
+    };
+    std::vector<size_t> local = { 1, 1, tune_params.simd };
+
+    kd.gws0 = global[0];
+    kd.gws1 = global[1];
+    kd.gws2 = global[2];
+
+    kd.lws0 = local[0];
+    kd.lws1 = local[1];
+    kd.lws2 = local[2];
+
+    kd.gemmStyle = { 0, 0, 0, 0, 0, 0 };
+
+    kd.cldnnStyle.blockWidth = tune_params.tile_x;
+
+    kd.efficiency = params.stride.x == 1 ? FORCE_PRIORITY_1 : FORCE_PRIORITY_2;
+
+    return kd;
+}
+
+JitConstants ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetJitConstants(const convolution_params& params, const DispatchData& kd) const {
+    auto mem_consts = Parent::GetJitConstants(params, kd);
+
+    constexpr size_t imad_width = 4;
+    auto filter_spatial = params.weights.X().v * params.weights.Y().v;
+    auto filter_blocked = filter_spatial / imad_width * imad_width;
+
+    mem_consts.AddConstant(MakeJitConstant("LWS0", kd.lws0));
+    mem_consts.AddConstant(MakeJitConstant("LWS1", kd.lws1));
+    mem_consts.AddConstant(MakeJitConstant("SIMD", kd.lws2));
+
+    mem_consts.AddConstant(MakeJitConstant("TILE_X", kd.cldnnStyle.blockWidth));
+    mem_consts.AddConstant(MakeJitConstant("FILTER_BLOCKED", filter_blocked));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        auto conf_1 = FusedOpsConfiguration("_1",
+                                            { "b", "fused_ops_f", "y", "fused_ops_x" },
+                                            "fused_ops_in",
+                                            input_dt,
+                                            1,
+                                            LoadType::LT_ALIGNED_READ,
+                                            BoundaryCheck::ENABLED,
+                                            IndexType::TENSOR_COORD,
+                                            Tensor::DataChannelName::FEATURE);
+        auto conf_2 = conf_1;
+        conf_2.suffix = "_2";
+        conf_2.vec_size = 2;
+        auto conf_4 = conf_1;
+        conf_4.suffix = "_4";
+        conf_4.vec_size = 4;
+        mem_consts.Merge(MakeFusedOpsJitConstants(params, { conf_1, conf_2, conf_4 }));
+    }
+
+    return mem_consts;
+}
+
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetTunedKernelsDataByIndex(const Params& params,
+                                                                                const optional_params& options,
+                                                                                int autoTuneIndex) const {
+    auto convParams = static_cast<const convolution_params&>(params);
+    auto tuneParams = GetAutoTuneParams(convParams, autoTuneIndex);
+    return GetCommonKernelsData(params, options, tuneParams.exeMode, autoTuneIndex);
+}
+
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+KernelsData ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw::GetKernelsDataForAutoTune(const Params& params,
+                                                                               const optional_params& options) const {
+    if (!Validate(params, options)) {
+        return {};
+    }
+    auto& conv_params = static_cast<const convolution_params&>(params);
+
+    KernelsData res = {};
+
+    for (size_t i = 0; i < all_tune_params.size(); i++) {
+        auto tune_params = GetAutoTuneParams(conv_params, static_cast<int>(i));
+        KernelsData kd = GetTunedKernelsDataByIndex(params, options, static_cast<int>(i));
+        if (!kd.empty()) {
+            res.emplace_back(kd[0]);
+        }
+    }
+
+    return res;
+}
+
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp
new file mode 100644 (file)
index 0000000..f3a2777
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <vector>
+#include <string>
+
+namespace kernel_selector {
+class ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw();
+    virtual ~ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw() {}
+
+    ParamsKey GetSupportedKey() const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    KernelsData GetKernelsDataForAutoTune(const Params & params, const optional_params & options) const override;
+    KernelsData GetTunedKernelsDataByIndex(const Params & params, const optional_params & options, int autoTuneIndex = -1) const override;
+
+protected:
+    bool Validate(const Params& params, const optional_params& options) const override;
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params& params) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return { FusedOpType::ELTWISE,
+                 FusedOpType::QUANTIZE,
+                 FusedOpType::SCALE,
+                 FusedOpType::ACTIVATION };
+    }
+
+    bool NeedPaddedInput() const override { return true; }
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
+
+    struct AutoTuneParams {
+        size_t simd;
+        size_t tile_x;
+        std::string exeMode;
+    };
+    std::vector<AutoTuneParams> all_tune_params;
+
+    AutoTuneParams GetAutoTuneParams(const convolution_params& params, int index) const;
+};
+}  // namespace kernel_selector
index 32bf719..f23e282 100644 (file)
@@ -216,6 +216,9 @@ KernelsData ConvolutionKernelBase::GetCommonKernelsData(const Params& params,
 
     if (NeedPaddedInput()) {
         kd.reorderInput = CovolutionUpdateInputParams(newParams);
+
+        if (kd.reorderInput && !options.allowInputReordering)
+            return {};
     }
     DispatchData runInfo = SetDefault(newParams, autoTuneIndex);
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.cpp
new file mode 100644 (file)
index 0000000..94cb32e
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "convolution_kernel_bfyx_iyxo.h"
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace kernel_selector {
+// Sub-group size used by "convolution_kernel_bfyx_iyxo" kernel.
+constexpr size_t sub_group_size = 16;
+
+ParamsKey ConvolutionKernel_bfyx_iyxo::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputWeightsType(WeightsType::F16);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableSubGroup();
+    k.EnableBiasPerFeature();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    return k;
+}
+
+ConvolutionKernelBase::DispatchData ConvolutionKernel_bfyx_iyxo::SetDefault(const convolution_params& cp, int) const {
+    DispatchData runInfo = ConvolutionKernelBase::SetDefault(cp);
+
+    runInfo.efficiency = FORCE_PRIORITY_9;
+
+    runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4;
+    runInfo.gws1 = cp.output.Y().v;
+    runInfo.gws2 = sub_group_size;
+
+    runInfo.lws0 = 1;
+    runInfo.lws1 = 1;
+    runInfo.lws2 = sub_group_size;
+
+    return runInfo;
+}
+
+bool ConvolutionKernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
+    if (!ConvolutionKernelBase::Validate(p, o) || !CovolutionCheckInput(p, o)) {
+        return false;
+    }
+
+    const auto& params = static_cast<const convolution_params&>(p);
+    if (params.inputs[0].X().v % 64)
+        return false;
+
+    bool bFilterSize = (params.filterSize.x == 5 && params.filterSize.y == 5) ||
+                       (params.filterSize.x == 3 && params.filterSize.y == 3 && (params.inputs[0].Feature().v % 4) == 0) ||
+                       (params.filterSize.x == 1 && params.filterSize.y == 1);
+
+    bool bStride = (params.stride.x == 1 && params.stride.y == 1);
+
+    if (!bFilterSize || !bStride || (params.output.Feature().v % 4) != 0 || (params.output.Batch().v != 1)) {
+        return false;
+    }
+
+    return true;
+}
+
+JitConstants ConvolutionKernel_bfyx_iyxo::GetJitConstants(const convolution_params& params, const DispatchData& runInfo) const {
+    auto jit = Parent::GetJitConstants(params, runInfo);
+
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+
+    return jit;
+}
+
+KernelsData ConvolutionKernel_bfyx_iyxo::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_bfyx_iyxo.h
new file mode 100644 (file)
index 0000000..ec82082
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "convolution_kernel_base.h"
+#include <string>
+
+namespace kernel_selector {
+
+class ConvolutionKernel_bfyx_iyxo : public ConvolutionKernelBase {
+public:
+    using Parent = ConvolutionKernelBase;
+    ConvolutionKernel_bfyx_iyxo() : Parent("convolution_gpu_bfyx_iyxo") {}
+    virtual ~ConvolutionKernel_bfyx_iyxo() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params&) const override {
+        return WeightsLayout::iyxo;
+    }
+
+    JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
+    bool Validate(const Params& p, const optional_params& o) const override;
+    bool NeedPaddedInput() const override { return true; }
+    DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override;
+};
+}  // namespace kernel_selector
index 4d8e7e3..20e2c9d 100644 (file)
 //
 #define SIMD_SIZE 16
 
-static bool getOutBlock_WH(size_t output_size,
+static void getOutBlock_WH(size_t output_size,
                            size_t stride,
                            size_t kernel_size,
+                           size_t dilation,
                            size_t& output_block_w,
                            size_t& output_block_h) {
-    bool verify_output_ranges = false;
-
     output_block_w = output_block_h = 0;
 
     size_t upper_border = output_size < SIMD_SIZE ? output_size : SIMD_SIZE;
 
-    size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1)) / stride;
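+    // Largest output block w such that its input footprint (w - 1) * stride + (kernel_size - 1) * dilation + 1 still fits into SIMD_SIZE columns.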
+    size_t stride_restrictions = (SIMD_SIZE - (kernel_size - 1) * dilation - 1) / stride + 1;
 
     size_t max_posible_tile_size = upper_border < stride_restrictions ? upper_border : stride_restrictions;
 
@@ -46,7 +45,7 @@ static bool getOutBlock_WH(size_t output_size,
 
         size_t block_size = 0;
 
-        for (size_t i = min_horisontal_block_size; i < max_posible_tile_size; i++) {
+        for (size_t i = min_horisontal_block_size; i <= max_posible_tile_size; i++) {
             if (output_size % i == 0)
                 block_size = i;
         }
@@ -55,7 +54,6 @@ static bool getOutBlock_WH(size_t output_size,
             output_block_w = block_size;
         } else {
             output_block_w = max_posible_tile_size;
-            verify_output_ranges = true;
         }
     }
 
@@ -63,8 +61,6 @@ static bool getOutBlock_WH(size_t output_size,
         output_block_h = output_block_w;
     else
         output_block_h = 1;
-
-    return verify_output_ranges;
 }
 
 namespace kernel_selector {
@@ -73,20 +69,28 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::UINT8);
+
     k.EnableOutputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::UINT8);
     k.EnableOutputDataType(Datatype::F32);
+
     k.EnableInputWeightsType(WeightsType::INT8);
     k.EnableInputWeightsType(WeightsType::UINT8);
+
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv4);
     k.EnableOutputLayout(DataLayout::byxf_af32);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+
     k.EnableDifferentTypes();
     k.EnableDifferentInputWeightsTypes();
     k.EnableTensorOffset();
     k.EnableTensorPitches();
-//    k.EnableDilation();
+    k.EnableDilation();
     k.EnableBiasPerFeature();
+    k.EnableGroupedConvolution();
     k.EnableNonBiasTerm();
     k.EnableBatching();
     k.EnableQuantization(QuantizationType::SYMMETRIC);
@@ -103,33 +107,24 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
 
     const auto& input = params.inputs[0];
     const auto& output = params.output;
-
-    const auto& iDims = input.GetDims();
-    const auto& oDims = output.GetDims();
     const auto& weights = params.weights;
-    const auto& wDims = weights.GetDims();
-    const int iX = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::X);
-    const int iY = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::Y);
-    const int iF = DataTensor::Channelndex(input.GetLayout(), Tensor::DataChannelName::FEATURE);
-    const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
-    const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
-    const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
+
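+    // Feature-slice size of the input layout; the input feature count is rounded up to it for the _ID constant.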
+    size_t in_fsv = 4;
+    if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4)
+        in_fsv = 4;
+    else if (params.inputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16)
+        in_fsv = 16;
+    else if (params.inputs[0].GetLayout() == DataLayout::byxf_af32)
+        in_fsv = 32;
 
     mem_consts.AddConstants({
-        MakeJitConstant("_IW", iDims[iX].v),
-        MakeJitConstant("_IH", iDims[iY].v),
-        MakeJitConstant("_ID", RoundUp(iDims[iF].v, 4)),
-        MakeJitConstant("IWPAD", iDims[iX].pad.before + iDims[iX].pad.after),
-        MakeJitConstant("IHPAD", iDims[iY].pad.before + iDims[iY].pad.after),
-        MakeJitConstant("_OW", oDims[oX].v),
-        MakeJitConstant("_OH", oDims[oY].v),
-        MakeJitConstant("_OD", wDims[wOD].v),
-        MakeJitConstant("OWPAD", oDims[oX].pad.before + oDims[oX].pad.after),
-        MakeJitConstant("OHPAD", oDims[oY].pad.before + oDims[oY].pad.after),
+        MakeJitConstant("_ID", RoundUp(input.Feature().v, in_fsv)),
+        MakeJitConstant("IWPAD", input.X().pad.Total()),
+        MakeJitConstant("IHPAD", input.Y().pad.Total()),
+        MakeJitConstant("_OD", Align(output.Feature().v, SIMD_SIZE)),
+        MakeJitConstant("OWPAD", output.X().pad.Total()),
+        MakeJitConstant("OHPAD", output.Y().pad.Total()),
         MakeJitConstant("SIMD_SIZE", SIMD_SIZE),
-        MakeJitConstant("K_HEIGHT", wDims[iY].v),
-        MakeJitConstant("K_WIDTH", wDims[iX].v),
-        MakeJitConstant("K_STRIDE", params.stride.x),  // X and Y must be equal
     });
 
     if (params.filterSize.x != 3 || params.filterSize.y != 3) {
@@ -139,10 +134,9 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
     mem_consts.Merge(MakeTypeJitConstants(GetPackedInputType(params), "PACKED"));
 
     size_t obw, obh;
-    bool verify_output_ranges = getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[iX].v, obw, obh);
+    getOutBlock_WH(output.X().v, params.stride.x, weights.X().v, params.dilation.x, obw, obh);
     mem_consts.AddConstants({MakeJitConstant("OUT_BLOCK_WIDTH", obw),
-                             MakeJitConstant("OUT_BLOCK_HEIGHT", obh),
-                             MakeJitConstant("NEED_TO_VERIFY_OUTPUT_RANGES", verify_output_ranges)});
+                             MakeJitConstant("OUT_BLOCK_HEIGHT", obh)});
 
     if (!params.fused_ops.empty()) {
         auto input_dt = GetActivationType(params);
@@ -155,39 +149,23 @@ JitConstants ConvolutionKernel_imad::GetJitConstants(const convolution_params& p
 }  // GetJitConstants
 
 ConvolutionKernelBase::DispatchData ConvolutionKernel_imad::SetDefault(const convolution_params& params,
-                                                                           int) const {
+                                                                       int) const {
     DispatchData kd;
 
-    const auto& in = params.inputs[0];
     const auto& output = params.output;
     const auto& weights = params.weights;
-    const auto& iDims = in.GetDims();
-    const auto& oDims = output.GetDims();
-    const auto& wDims = weights.GetDims();
-    const int oX = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::X);
-    const int oY = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::Y);
-    const int oB = DataTensor::Channelndex(output.GetLayout(), Tensor::DataChannelName::BATCH);
-    const int wOD = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::OFM);
-    const int wX = WeightsTensor::Channelndex(weights.GetLayout(), Tensor::WeightsChannelName::X);
 
     size_t otw, oth;
-    getOutBlock_WH(oDims[oX].v, params.stride.x, wDims[wX].v, otw, oth);
+    getOutBlock_WH(output.X().v, params.stride.x, weights.X().v, params.dilation.x, otw, oth);
 
-    size_t dim_add = ((wDims[wOD].v * iDims[oB].v) % SIMD_SIZE);
-    if (dim_add != 0)
-        dim_add = SIMD_SIZE - dim_add;
+    std::vector<size_t> global = {// number of tiles needed to cover output width
+                                  CeilDiv(output.X().v, otw),
 
-    std::vector<size_t> global = {// globalRange[0] = ((_IW / K_STRIDE) + (OTW - 1)) / OTW;
-                                  // number of tiles needed to cover output width
-                                  CeilDiv(oDims[oX].v, otw),
-
-                                  // globalRange[1] = ((_IH / K_STRIDE) + (OTH - 1)) / OTH;
                                   // number of tiles needed to cover output height
-                                  CeilDiv(oDims[oY].v, oth),
+                                  CeilDiv(output.Y().v, oth),
 
-                                  // globalRange[2] = (_OD * _B) + ((_B *_OD) % __WORKGROUP_SIZE);
                                   // round depth range up
-                                  ((wDims[wOD].v * iDims[oB].v) + dim_add)};
+                                  Align(weights.OFM().v, SIMD_SIZE) * params.groups * output.Batch().v};
 
     std::vector<size_t> local = {1, 1, SIMD_SIZE};
 
@@ -205,7 +183,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_imad::SetDefault(const con
     // This kernel is quite slow for 1x1 and KHx1 kernels
     // TODO: check if we need any optimized kernels in this layout
     // If yes, we need to implement some customization for these cases.
-    kd.efficiency = FORCE_PRIORITY_2;
+    kd.efficiency = FORCE_PRIORITY_3;
 
     return kd;
 }  // SetDefault
@@ -215,17 +193,13 @@ bool ConvolutionKernel_imad::Validate(const Params& params, const optional_param
         return false;
     }
 
-    KernelData kd = KernelData::Default<convolution_params>(params);
-    convolution_params& newParams = *static_cast<convolution_params*>(kd.params.get());
-
-    if (newParams.stride.x != newParams.stride.y) {
-        // Strides must be equal
+    const auto& newParams = static_cast<const convolution_params&>(params);
+    if ((newParams.inputs[0].Feature().v / newParams.groups) % 4 != 0)
         return false;
-    }
-    if (newParams.output.X().v != newParams.output.Y().v) {
-        // W and H must be equal
+
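+    // Input span required by a single output element along X; it has to fit within one SIMD_SIZE-wide block.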
+    size_t min_block_size_x = (newParams.weights.X().v - 1) * newParams.dilation.x + 1;
+    if (min_block_size_x > SIMD_SIZE)
         return false;
-    }
 
     return true;
 }
index f3db5dd..37378c7 100644 (file)
@@ -35,8 +35,8 @@ protected:
     JitConstants GetJitConstants(const convolution_params& params, const DispatchData& kd) const override;
     DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
     bool NeedPaddedInput() const override { return true; }
-    WeightsLayout GetPreferredWeightsLayout(const convolution_params &) const override {
-        return WeightsLayout::os_is_yx_osv16_isv4;
+    WeightsLayout GetPreferredWeightsLayout(const convolution_params &p) const override {
+        return p.groups > 1 ? WeightsLayout::g_os_is_yx_osv16_isv4 : WeightsLayout::os_is_yx_osv16_isv4;
     }
 
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
index 99dba36..4e069df 100644 (file)
@@ -21,6 +21,7 @@
 #include "convolution_kernel_bfyx_direct_10_12_16.h"
 #include "convolution_kernel_bfyx_os_iyx_osv16.h"
 #include "convolution_kernel_bfyx_os_iyx_osv16_2_sg.h"
+#include "convolution_kernel_bfyx_iyxo.h"
 #include "convolution_kernel_yxfb_ref.h"
 #include "convolution_kernel_yxfb_yxio_b16.h"
 #include "convolution_kernel_yxfb_yxio_b8.h"
 #include "convolution_kernel_mmad_b_fs_yx_fsv32_dw.h"
 #include "convolution_kernel_mmad_bfyx_b_fs_yx_fsv32.h"
 #include "convolution_kernel_bfyx_to_bs_fs_yx_bsv16_fsv16.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_1x1.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3.h"
+#include "convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks.h"
+#include "convolution_kernel_b_fs_yx_fsv_16_32_imad_dw.hpp"
 
 namespace kernel_selector {
 convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_Ref>();
     Attach<DeformableConvolutionKernel_bfyx_Ref>();
 
+    // b_fs_yx_fsv16 int8
+    Attach<Convolution_kernel_b_fs_yx_fsv16_imad_1x1>();
+    Attach<Convolution_kernel_b_fs_yx_fsv16_imad_3x3>();
+    Attach<Convolution_kernel_b_fs_yx_fsv16_imad_3x3_ks>();
+
     // b_fs_yx_fsv16 and b_fs_zyx_fsv16
     Attach<ConvolutionKernel_b_fs_yx_fsv16_depthwise>();
     Attach<ConvolutionKernel_b_fs_yx_fsv16_1x1>();
@@ -97,6 +107,7 @@ convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_bfyx_GEMMLike>();
     Attach<ConvolutionKernel_bfyx_Direct_10_10_12>();
     Attach<ConvolutionKernel_bfyx_os_iyx_osv16>();
+    Attach<ConvolutionKernel_bfyx_iyxo>();
     Attach<ConvolutionKernel_bfyx_1x1>();
     Attach<ConvolutionKernel_bfyx_1x1_gemm_buf>();
     Attach<ConvolutionKernel_bfyx_depthwise_weights_lwg>();
@@ -145,6 +156,7 @@ convolution_kernel_selector::convolution_kernel_selector() {
     Attach<ConvolutionKernel_mmad_b_fs_yx_fsv32>();
     Attach<ConvolutionKernel_mmad_b_fs_yx_fsv32_dw>();
     Attach<ConvolutionKernel_mmad_bfyx_b_fs_yx_fsv32>();
+    Attach<ConvolutionKernel_b_fs_yx_fsv_16_32_imad_dw>();
 }
 
 KernelsData convolution_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.cpp
new file mode 100644 (file)
index 0000000..081ff45
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "depth_to_space_kernel_base.h"
+#include "kernel_selector_utils.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+
+bool DepthToSpaceKernelBase::Validate(const Params& p, const optional_params& o) const {
+    if (p.GetType() != KernelType::DEPTH_TO_SPACE ||
+        o.GetType() != KernelType::DEPTH_TO_SPACE) {
+        return false;
+    }
+
+    return true;
+}
+
+CommonDispatchData DepthToSpaceKernelBase::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData runInfo;
+
+    std::vector<size_t> global = { params.output.Batch().v,
+                                   params.output.Feature().v,
+                                   params.output.Y().v * params.output.X().v };
+
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = local[0];
+    runInfo.lws1 = local[1];
+    runInfo.lws2 = local[2];
+
+    return runInfo;
+}
+
+JitConstants DepthToSpaceKernelBase::GetJitConstants(const depth_to_space_params& params) const {
+    JitConstants jit = MakeBaseParamsJitConstants(params);
+
+    jit.AddConstant(MakeJitConstant("BLOCK_SIZE", params.block_size));
+
+    return jit;
+}
+
+KernelsData DepthToSpaceKernelBase::GetCommonKernelsData(const Params& params, const optional_params& options, float estimatedTime) const {
+    KernelData kd = KernelData::Default<depth_to_space_params>(params);
+    depth_to_space_params& newParams = *static_cast<depth_to_space_params*>(kd.params.get());
+
+    if (!Validate(params, options)) {
+        return {};
+    }
+
+    auto runInfo = SetDefault(newParams);
+    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
+    auto cldnn_jit = GetJitConstants(newParams);
+    std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    auto& kernel = kd.kernels[0];
+
+    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
+
+    kd.estimatedTime = estimatedTime;
+
+    return { kd };
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h
new file mode 100644 (file)
index 0000000..366938c
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "common_kernel_base.h"
+#include "kernel_selector_params.h"
+
+namespace kernel_selector {
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// depth_to_space_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct depth_to_space_params : public base_params {
+    depth_to_space_params() : base_params(KernelType::DEPTH_TO_SPACE), block_size(0) {}
+    size_t block_size;
+
+    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// depth_to_space_optional_params
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+struct depth_to_space_optional_params : optional_params {
+    depth_to_space_optional_params() : optional_params(KernelType::DEPTH_TO_SPACE) {}
+};
+
+struct depth_to_space_fuse_params : fuse_params {
+    depth_to_space_fuse_params() : fuse_params(KernelType::DEPTH_TO_SPACE) {}
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// DepthToSpaceKernelBase
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+class DepthToSpaceKernelBase : public common_kernel_base {
+public:
+    using common_kernel_base::common_kernel_base;
+    virtual ~DepthToSpaceKernelBase() {}
+
+    struct DispatchData : public CommonDispatchData {
+    };
+
+protected:
+    virtual bool Validate(const Params&, const optional_params&) const;
+    virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
+    virtual CommonDispatchData SetDefault(const depth_to_space_params& params) const;
+    KernelsData GetCommonKernelsData(const Params& params, const optional_params&, float estimatedTime) const;
+};
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.cpp
new file mode 100644 (file)
index 0000000..e926af6
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "depth_to_space_kernel_block2_opt.h"
+#include "kernel_selector_utils.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+ParamsKey DepthToSpaceKernelBlock2Opt::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    return k;
+}
+
+bool DepthToSpaceKernelBlock2Opt::Validate(const Params& p, const optional_params& o) const {
+    if (!DepthToSpaceKernelBase::Validate(p, o))
+        return false;
+
+    const auto& params = static_cast<const depth_to_space_params&>(p);
+
+    if ((params.block_size != 2) || (params.inputs[0].X().v % 2 != 0))
+        return false;
+
+    return true;
+}
+
+CommonDispatchData DepthToSpaceKernelBlock2Opt::SetDefault(const depth_to_space_params& params) const {
+    CommonDispatchData runInfo;
+
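+    // Dispatch covers half of the input width (block_size == 2), aligned up to 16 along X.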
+    std::vector<size_t> global = { Align(params.inputs[0].X().v / 2, 16),
+                                   params.inputs[0].Y().v,
+                                   1};
+
+    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
+
+    runInfo.gws0 = global[0];
+    runInfo.gws1 = global[1];
+    runInfo.gws2 = global[2];
+
+    runInfo.lws0 = local[0];
+    runInfo.lws1 = local[1];
+    runInfo.lws2 = local[2];
+
+    return runInfo;
+}
+
+JitConstants DepthToSpaceKernelBlock2Opt::GetJitConstants(const depth_to_space_params& params) const {
+    JitConstants jit = Parent::GetJitConstants(params);
+
+    jit.AddConstant(MakeJitConstant("IN_WIDTH", params.inputs[0].X().v / 2));
+
+    return jit;
+}
+
+KernelsData DepthToSpaceKernelBlock2Opt::GetKernelsData(const Params& params, const optional_params& options) const {
+    return GetCommonKernelsData(params, options, FORCE_PRIORITY_5);
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_block2_opt.h
new file mode 100644 (file)
index 0000000..721f49a
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#pragma once
+
+#include "depth_to_space_kernel_base.h"
+
+namespace kernel_selector {
+class DepthToSpaceKernelBlock2Opt : public DepthToSpaceKernelBase {
+public:
+    using Parent = DepthToSpaceKernelBase;
+
+    DepthToSpaceKernelBlock2Opt() : DepthToSpaceKernelBase("depth_to_space_block2_opt") {}
+    virtual ~DepthToSpaceKernelBlock2Opt() {}
+
+    bool Validate(const Params&, const optional_params&) const override;
+    JitConstants GetJitConstants(const depth_to_space_params& params) const override;
+    CommonDispatchData SetDefault(const depth_to_space_params& params) const override;
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+};
+}  // namespace kernel_selector
index 6b67f9f..827b670 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -34,52 +34,7 @@ ParamsKey DepthToSpaceKernelRef::GetSupportedKey() const {
     return k;
 }
 
-CommonDispatchData DepthToSpaceKernelRef::SetDefault(const depth_to_space_params& params,
-                                                     const optional_params&) const {
-    CommonDispatchData runInfo;
-
-    std::vector<size_t> global = {params.output.Batch().v,
-                                  params.output.Feature().v,
-                                  params.output.Y().v * params.output.X().v};
-
-    auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
-
-    runInfo.gws0 = global[0];
-    runInfo.gws1 = global[1];
-    runInfo.gws2 = global[2];
-
-    runInfo.lws0 = local[0];
-    runInfo.lws1 = local[1];
-    runInfo.lws2 = local[2];
-
-    return runInfo;
-}
-
-JitConstants DepthToSpaceKernelRef::GetJitConstants(const depth_to_space_params& params) const {
-    JitConstants jit = MakeBaseParamsJitConstants(params);
-
-    jit.AddConstant(MakeJitConstant("BLOCK_SIZE", params.block_size));
-
-    return jit;
-}
-
 KernelsData DepthToSpaceKernelRef::GetKernelsData(const Params& params, const optional_params& options) const {
-    KernelData kd = KernelData::Default<depth_to_space_params>(params);
-    depth_to_space_params& newParams = *static_cast<depth_to_space_params*>(kd.params.get());
-
-    assert(params.GetType() == KernelType::DEPTH_TO_SPACE);
-
-    auto runInfo = SetDefault(newParams, options);
-    auto entry_point = GetEntryPoint(kernelName, newParams.layerID, options);
-    auto cldnn_jit = GetJitConstants(newParams);
-    std::string jit = CreateJit(kernelName, cldnn_jit, entry_point);
-
-    auto& kernel = kd.kernels[0];
-
-    FillCLKernelData(kernel, runInfo, params.engineInfo, kernelName, jit, entry_point);
-
-    kd.estimatedTime = DONT_USE_IF_HAVE_SOMETHING_ELSE;
-
-    return {kd};
+    return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
 }
 }  // namespace kernel_selector
index 50f314a..f74111f 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #pragma once
 
-#include "common_kernel_base.h"
+#include "depth_to_space_kernel_base.h"
 
 namespace kernel_selector {
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// depth_to_space_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct depth_to_space_params : public base_params {
-    depth_to_space_params() : base_params(KernelType::DEPTH_TO_SPACE), block_size(0) {}
-
-    size_t block_size;
-
-    virtual ParamsKey GetParamsKey() const { return base_params::GetParamsKey(); }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// depth_to_space_optional_params
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct depth_to_space_optional_params : optional_params {
-    depth_to_space_optional_params() : optional_params(KernelType::DEPTH_TO_SPACE) {}
-};
-
-class DepthToSpaceKernelRef : public common_kernel_base {
+class DepthToSpaceKernelRef : public DepthToSpaceKernelBase {
 public:
-    DepthToSpaceKernelRef() : common_kernel_base("depth_to_space_ref") {}
+    DepthToSpaceKernelRef() : DepthToSpaceKernelBase("depth_to_space_ref") {}
     virtual ~DepthToSpaceKernelRef() {}
-    virtual JitConstants GetJitConstants(const depth_to_space_params& params) const;
-    virtual CommonDispatchData SetDefault(const depth_to_space_params& params, const optional_params&) const;
+
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
 };
index 67e444d..d6e4264 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 
 #include "depth_to_space_kernel_selector.h"
 #include "depth_to_space_kernel_ref.h"
+#include "depth_to_space_kernel_block2_opt.h"
 
 namespace kernel_selector {
 
-depth_to_space_kernel_selector::depth_to_space_kernel_selector() { Attach<DepthToSpaceKernelRef>(); }
+depth_to_space_kernel_selector::depth_to_space_kernel_selector() {
+    Attach<DepthToSpaceKernelRef>();
+    Attach<DepthToSpaceKernelBlock2Opt>();
+}
 
 KernelsData depth_to_space_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
     return GetNaiveBestKernel(params, options, KernelType::DEPTH_TO_SPACE);
index 0de10dd..c1bbf41 100644 (file)
@@ -156,7 +156,7 @@ bool EltwiseKernel_b_fs_yx_fsv16::Validate(const Params& params, const optional_
     for (size_t i = 0; i < ewParams.inputs.size(); i++) {
         // Allow the same input sizes OR per-channel operation
         if ((ewParams.inputs[i].LogicalSize() != output.LogicalSize()) &&
-            (ewParams.inputs[i].LogicalSize() != output.Feature().v) &&
+            (ewParams.inputs[i].LogicalSize() != output.Feature().v || ewParams.inputs[i].Feature().v != output.Feature().v) &&
             (ewParams.inputs[i].LogicalSize() != 1))
             return false;
     }
index 3885b30..948e832 100644 (file)
@@ -25,14 +25,20 @@ ParamsKey FullyConnectedKernelIMAD::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableInputDataType(Datatype::UINT8);
+
     k.EnableOutputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::UINT8);
     k.EnableOutputDataType(Datatype::F32);
+
     k.EnableInputWeightsType(WeightsType::INT8);
-    k.EnableDifferentInputWeightsTypes();
-    k.EnableDifferentTypes();
+
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv4);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+
     k.EnableOutputLayout(DataLayout::bf);
+
+    k.EnableDifferentInputWeightsTypes();
+    k.EnableDifferentTypes();
     k.EnableBiasPerOutput();
     k.EnableBiasPerFeature();
     k.EnableNonBiasTerm();
@@ -101,11 +107,14 @@ JitConstants FullyConnectedKernelIMAD::GetJitConstants(const fully_connected_par
 }
 
 KernelsData FullyConnectedKernelIMAD::GetKernelsData(const Params& params, const optional_params& options) const {
+    const auto& fc_params = static_cast<const fully_connected_params&>(params);
+    const auto& input = fc_params.inputs[0];
+
     KernelsData res = {};
     for (size_t i = 0; i < autoTuneOptions.size(); i++) {
         KernelsData kd = GetTunedKernelsDataByIndex(params,
                                                     options,
-                                                    DataLayout::b_fs_yx_fsv4,
+                                                    input.GetLayout(),
                                                     WeightsLayout::os_is_yx_osv16_isv4,
                                                     FORCE_PRIORITY_1,
                                                     static_cast<int>(i));
index d02dfbb..ac48606 100644 (file)
@@ -90,6 +90,10 @@ ParamsKey fused_conv_eltwise_params::GetParamsKey() const {
         k.EnableFusedConvEltwiseRWOutOpt();
     }
 
+    if (depth_to_space_already_fused) {
+        k.EnableFusedConvEltwDepthToSpaceFusing();
+    }
+
     return k;
 }
 
@@ -370,7 +374,11 @@ KernelsData fused_conv_eltwise_kernel_base::GetKernelsDataForAutoTune(const Para
 }
 
 static DataTensor GetConvolutionBFYXPaddedTensor(const fused_conv_eltwise_params& cp) {
-    DataTensor t = cp.inputs[0];
+    DataTensor t;
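+    // Base the padded tensor on the eltwise (second) input when it is at least as wide as the convolution input.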
+    if (cp.inputs.size() > 1 && (cp.inputs[0].X().v <= cp.inputs[1].X().v))
+        t = cp.inputs[1];
+    else
+        t = cp.inputs[0];
     std::vector<Tensor::Pad> pad{{0, 0}, {0, 0}, {0, 0}, {0, 0}, { 0, 0 } };
 
     auto& conv = cp.conv;
index c24efee..43d3c81 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -64,6 +64,7 @@ struct fused_conv_eltwise_params : public weight_bias_params {
 
     float non_conv_scale = 1.0f;
     bool second_input_in_output = false;
+    bool depth_to_space_already_fused = false;
 
     std::string to_string() const override;
     std::string to_cache_string_v2() const override;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.cpp
new file mode 100644 (file)
index 0000000..6f91ce2
--- /dev/null
@@ -0,0 +1,96 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "fused_conv_eltwise_kernel_bfyx_iyxo.h"
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace kernel_selector {
+constexpr size_t sub_group_size = 16;
+
+fused_conv_eltwise_kernel_bfyx_iyxo::fused_conv_eltwise_kernel_bfyx_iyxo()
+    : fused_conv_eltwise_kernel_base("fused_conv_eltwise_gpu_bfyx_iyxo") {
+}
+
+ParamsKey fused_conv_eltwise_kernel_bfyx_iyxo::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputWeightsType(WeightsType::F16);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableInputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::bfyx);
+    k.EnableOutputLayout(DataLayout::image_2d_rgba);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableSubGroup();
+    k.EnableSubGroupShort();
+    k.EnableBiasPerFeature();
+    k.EnableBiasPerOutput();
+    k.EnableNonBiasTerm();
+    k.EnableBatching();
+    k.EnableDifferentTypes();
+    k.EnableFusedConvEltwSplitSupport();
+    k.EnableFusedConvEltwDilation();
+    k.EnableFusedConvEltwTranspose();
+    k.EnableFusedConvEltwiseRWOutOpt();
+    k.EnableFusedConvEltwDepthToSpaceFusing();
+    return k;
+}
+
+fused_conv_eltwise_kernel_base::DispatchData fused_conv_eltwise_kernel_bfyx_iyxo::SetDefault(
+    const fused_conv_eltwise_params& cp,
+    int) const {
+    DispatchData runInfo = fused_conv_eltwise_kernel_base::SetDefault(cp);
+
+    runInfo.efficiency = FORCE_PRIORITY_9;
+
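+    // Dispatch layout (descriptive note): one 16-wide sub-group per 128 output columns and 2 output rows,
+    // matching the X % 128 and Y % 2 checks in Validate().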
+    runInfo.gws0 = CeilDiv(cp.output.X().v, sub_group_size) / 4 / 2;
+    runInfo.gws1 = cp.output.Y().v / 2;
+    runInfo.gws2 = sub_group_size;
+
+    runInfo.lws0 = 1;
+    runInfo.lws1 = 1;
+    runInfo.lws2 = sub_group_size;
+
+    return runInfo;
+}
+
+bool fused_conv_eltwise_kernel_bfyx_iyxo::Validate(const Params& p, const optional_params& o) const {
+    if (!fused_conv_eltwise_kernel_base::Validate(p, o) || !FusedConvolutionEltwiseCheckInput(p, o)) {
+        return false;
+    }
+
+    const auto& params = static_cast<const fused_conv_eltwise_params&>(p);
+    if (params.inputs[0].X().v % 128 || params.inputs[0].Y().v % 2)
+        return false;
+
+    return true;
+}
+
+JitConstants fused_conv_eltwise_kernel_bfyx_iyxo::GetJitConstants(const fused_conv_eltwise_params& params,
+                                                                  const DispatchData& runInfo) const {
+    auto jit = Parent::GetJitConstants(params, runInfo);
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", runInfo.lws2));
+    return jit;
+}
+
+KernelsData fused_conv_eltwise_kernel_bfyx_iyxo::GetKernelsData(const Params& params,
+                                                                        const optional_params& options) const {
+    return GetTunedKernelsDataByIndex(params, options);
+}
+
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/fused_conv_eltwise/fused_conv_eltwise_kernel_bfyx_iyxo.h
new file mode 100644 (file)
index 0000000..965a863
--- /dev/null
@@ -0,0 +1,42 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "fused_conv_eltwise_kernel_base.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+
+class fused_conv_eltwise_kernel_bfyx_iyxo : public fused_conv_eltwise_kernel_base {
+public:
+    using Parent = fused_conv_eltwise_kernel_base;
+    fused_conv_eltwise_kernel_bfyx_iyxo();
+    virtual ~fused_conv_eltwise_kernel_bfyx_iyxo() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    WeightsLayout GetPreferreddWeightsLayout(const fused_conv_eltwise_params&) const override {
+        return WeightsLayout::iyxo;
+    }
+    JitConstants GetJitConstants(const fused_conv_eltwise_params& params, const DispatchData& kd) const override;
+    bool Validate(const Params& p, const optional_params& o) const override;
+    bool NeedPaddedInput() const override { return true; }
+    DispatchData SetDefault(const fused_conv_eltwise_params& arg, int autoTuneIndex = -1) const override;
+};
+}  // namespace kernel_selector
index 80e6057..fe1976b 100644 (file)
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2018 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "fused_conv_eltwise_kernel_yxfb_yxio_b16.h"
 #include "fused_conv_eltwise_kernel_imad.h"
 #include "fused_conv_eltwise_kernel_af32_imad_1x1.h"
+#include "fused_conv_eltwise_kernel_bfyx_iyxo.h"
 
 namespace kernel_selector {
 fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() {
@@ -33,6 +34,7 @@ fused_conv_eltwise_kernel_selector::fused_conv_eltwise_kernel_selector() {
     Attach<fused_conv_eltwise_kernel_mmad_32x32sg_224x128wg_slm_int8>();
     Attach<fused_conv_eltwise_kernel_imad>();
     Attach<fused_conv_eltwise_kernel_af32_imad_1x1>();
+    Attach<fused_conv_eltwise_kernel_bfyx_iyxo>();
 }
 
 KernelsData fused_conv_eltwise_kernel_selector::GetBestKernels(const Params& params,
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.cpp
new file mode 100644 (file)
index 0000000..c7d1c6a
--- /dev/null
@@ -0,0 +1,315 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "mvn_kernel_b_fs_yx_fsv16_imad.hpp"
+#include "common/common_tools.h"
+
+#include <string>
+#include <algorithm>
+#include <iostream>
+
+namespace kernel_selector {
+
+static constexpr size_t simd = 16;
+static constexpr size_t fsv = 16;
+static constexpr size_t pref_work_groups = 16;
+
+ParamsKey MVNKernel_b_fs_yx_fsv16_imad::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableDifferentTypes();
+    k.EnableBatching();
+    // TODO: Add support for the across-channels mode.
+    // k.EnableMVNMode(MVNMode::ACROSS_CHANNELS);
+    k.EnableMVNMode(MVNMode::WITHIN_CHANNELS);
+    k.EnableMVNNormalizeVariance();
+    return k;
+}
+
+bool MVNKernel_b_fs_yx_fsv16_imad::Validate(const Params& p, const optional_params& options) const {
+    if (!Parent::Validate(p, options))
+        return false;
+
+    auto params = static_cast<const mvn_params&>(p);
+
+    // TODO: Add support for input padding by iterating over y (in parallel or inside the kernel).
+    if (params.inputs[0].X().pad.Total() != 0 || params.inputs[0].Y().pad.Total() != 0)
+        return false;
+
+    return true;
+}
+
+MVNKernelBase::DispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefault(const mvn_params& params) const {
+    auto kd = Parent::SetDefault(params);
+
+    auto items_num = params.output.X().v * params.output.Y().v;
+    auto max_wg = params.engineInfo.maxWorkGroupSize;
+    auto slm_per_sg = fsv * 4;
+    auto max_slm = params.engineInfo.maxLocalMemSize;
+    auto max_sgs = max_slm / slm_per_sg;
+
+    auto max_lws = std::min(max_wg, max_sgs * simd);
+
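+    // Pick the largest work-group that is a multiple of the sub-group size and fits the device limit,
+    // the available local memory and the number of spatial items.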
+    auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
+
+    kd.gws0 = lws;
+    kd.gws1 = CeilDiv(params.output.Feature().v, fsv);
+    kd.gws2 = params.output.Batch().v;
+
+    kd.lws0 = lws;
+    kd.lws1 = 1;
+    kd.lws2 = 1;
+
+    kd.itemsNum = 1;
+
+    return kd;
+}
+
+JitConstants MVNKernel_b_fs_yx_fsv16_imad::GetJitConstants(const mvn_params& params, DispatchData kd) const {
+    auto jits = Parent::GetJitConstants(params, kd);
+
+    auto activation_dt = GetActivationType(params);
+    jits.Merge(MakeTypeJitConstants(activation_dt, "MEAN"));
+    jits.AddConstant(MakeJitConstant("SIMD", simd));
+    jits.AddConstant(MakeJitConstant("LWS", kd.lws0));
+    jits.AddConstant(MakeJitConstant("GWS", kd.gws0));
+    jits.AddConstant(MakeJitConstant("ITEM_GROUPS", kd.itemsNum));
+
+    if (!params.fused_ops.empty()) {
+        std::vector<std::string> idx_order;
+        idx_order = { "b", "(f + set_idx)", "(output_spatial / OUTPUT_SIZE_X)", "(output_spatial % OUTPUT_SIZE_X)" };
+        auto conf = FusedOpsConfiguration("", idx_order, "normalized", activation_dt);
+        jits.Merge(MakeFusedOpsJitConstants(params, { conf }));
+    }
+    return jits;
+}
+
+MVNKernel_b_fs_yx_fsv16_imad::MultiDispatchData MVNKernel_b_fs_yx_fsv16_imad::SetDefaultForMulti(const mvn_params& params) const {
+    MultiDispatchData md;
+
+    auto items_num = params.output.X().v * params.output.Y().v;
+    auto max_wg = params.engineInfo.maxWorkGroupSize;
+    auto slm_per_sg = fsv * 4;
+    auto max_slm = params.engineInfo.maxLocalMemSize;
+    auto max_sgs = max_slm / slm_per_sg;
+
+    auto max_lws = std::min(max_wg, max_sgs * simd);
+    auto lws = std::max(std::min(items_num, max_lws) / simd, (size_t)1) * simd;
+
+    // TODO: Check whether a larger number of work-groups would provide any benefit.
+    size_t item_groups = pref_work_groups;
+    md.item_groups = item_groups;
+
+    size_t stage1_lws = lws;
+
+    md.stage_1.gws0 = stage1_lws * item_groups;
+    md.stage_1.gws1 = CeilDiv(params.output.Feature().v, fsv);
+    md.stage_1.gws2 = params.output.Batch().v;
+
+    md.stage_1.lws0 = stage1_lws;
+    md.stage_1.lws1 = 1;
+    md.stage_1.lws2 = 1;
+
+    md.stage_1.itemsNum = item_groups;
+
+    size_t stage2_lws = std::max(std::min(item_groups, max_lws) / simd, (size_t)1) * simd;
+
+    md.stage_2.gws0 = stage2_lws;
+    md.stage_2.gws1 = CeilDiv(params.output.Feature().v, fsv);
+    md.stage_2.gws2 = params.output.Batch().v;
+
+    md.stage_2.lws0 = stage2_lws;
+    md.stage_2.lws1 = 1;
+    md.stage_2.lws2 = 1;
+
+    md.stage_2.itemsNum = item_groups;
+
+    md.stage_final.gws0 = std::max(items_num / simd, (size_t)1) * simd;
+    md.stage_final.gws1 = CeilDiv(params.output.Feature().v, fsv);
+    md.stage_final.gws2 = params.output.Batch().v;
+
+    md.stage_final.lws0 = simd;
+    md.stage_final.lws1 = 1;
+    md.stage_final.lws2 = 1;
+
+    md.stage_final.itemsNum = 1;
+
+    return md;
+}
+
+KernelsData MVNKernel_b_fs_yx_fsv16_imad::GetMultiStageKernelsData(const mvn_params& params, const optional_params& options, float estimated_time) const {
+    if (!Validate(params, options))
+        return {};
+
+    constexpr size_t intermediate_bytes = 4;
+    const mvn_params& orgParams = static_cast<const mvn_params&>(params);
+
+    auto runInfo = SetDefaultForMulti(orgParams);
+
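+    // Multi-stage pipeline: partial mean, mean reduction, optionally partial variance and variance reduction,
+    // and a final normalization kernel; intermediate results are kept in internal buffers.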
+    size_t kernels_num = params.mvnNormalizeVariance ? 5 : 3;
+    KernelData kd = KernelData::Default<mvn_params>(params, kernels_num);
+
+    auto finalKernelName = GetKernelName(orgParams);
+    {
+        // Mean first stage
+        auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+        cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_1", 1));
+        auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+        auto& kernel = kd.kernels[0];
+        FillCLKernelData(kernel,
+                         runInfo.stage_1,
+                         params.engineInfo,
+                         finalKernelName,
+                         jit,
+                         entry_point,
+                         "",
+                         false,
+                         false,
+                         0,
+                         0);
+        kernel.arguments.clear();  // Clear original output argument
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 0 });
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+        kd.internalBufferSizes.push_back(
+            params.output.Batch().v * Align(params.output.Feature().v, fsv) * runInfo.item_groups * intermediate_bytes);
+    }
+    {
+        // Mean second stage
+        auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+        cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MEAN_2", 1));
+        auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+        auto& kernel = kd.kernels[1];
+        FillCLKernelData(kernel,
+                         runInfo.stage_2,
+                         params.engineInfo,
+                         finalKernelName,
+                         jit,
+                         entry_point,
+                         "",
+                         false,
+                         false,
+                         0,
+                         0);
+        kernel.arguments.clear();  // Clear original output argument
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+        kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * intermediate_bytes);
+    }
+    if (params.mvnNormalizeVariance) {
+        // Variance first stage
+        auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_1);
+        cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_1", 1));
+        auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+        auto& kernel = kd.kernels[2];
+        FillCLKernelData(kernel,
+                         runInfo.stage_1,
+                         params.engineInfo,
+                         finalKernelName,
+                         jit,
+                         entry_point,
+                         "",
+                         false,
+                         false,
+                         0,
+                         0);
+        kernel.arguments.clear();  // Clear original output argument
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INPUT, 0 });
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+    }
+    if (params.mvnNormalizeVariance) {
+        // Variance second stage
+        auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_2);
+        cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_VAR_2", 1));
+        auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+        auto& kernel = kd.kernels[3];
+        FillCLKernelData(kernel,
+                         runInfo.stage_2,
+                         params.engineInfo,
+                         finalKernelName,
+                         jit,
+                         entry_point,
+                         "",
+                         false,
+                         false,
+                         0,
+                         0);
+        kernel.arguments.clear();  // Clear original output argument
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 0 });
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 2 });
+        kd.internalBufferSizes.push_back(params.output.Batch().v * Align(params.output.Feature().v, fsv) * intermediate_bytes);
+    }
+    {  // Final
+        auto cldnn_jit = GetJitConstants(orgParams, runInfo.stage_final);
+        cldnn_jit.AddConstant(MakeJitConstant("MVN_KERNEL_MAIN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("PRECALC_MEAN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("PRECALC_VARIANCE", params.mvnNormalizeVariance));
+        auto entry_point = GetEntryPoint(finalKernelName, orgParams.layerID, options);
+        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
+        auto& kernel = kd.kernels[kernels_num - 1];
+        FillCLKernelData(kernel,
+                         runInfo.stage_final,
+                         params.engineInfo,
+                         finalKernelName,
+                         jit,
+                         entry_point,
+                         "",
+                         false,
+                         false,
+                         1,
+                         GetFusedPrimitiveInputsCount(params));
+        kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 1 });
+        if (params.mvnNormalizeVariance) {
+            kernel.arguments.push_back({ ArgumentDescriptor::Types::INTERNAL_BUFFER, 2 });
+        }
+    }
+    kd.intenralBufferDataType = Datatype::F32;
+    kd.estimatedTime = estimated_time;
+
+    return { kd };
+}
+
+KernelsData MVNKernel_b_fs_yx_fsv16_imad::GetKernelsData(const Params& params, const optional_params& optParams) const {
+    const mvn_params& orgParams = static_cast<const mvn_params&>(params);
+
+    auto max_slm = params.engineInfo.maxLocalMemSize;
+    auto slm_per_sg = fsv * 4;
+    auto max_lws = params.engineInfo.maxWorkGroupSize;
+    auto items_num = orgParams.output.X().v * orgParams.output.Y().v;
+
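+    // Use the multi-stage reduction only when there is enough local memory per sub-group, at least one full
+    // sub-group fits the work-group limit, and there is enough spatial work to split across work-groups.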
+    auto enough_slm = max_lws / simd * simd * slm_per_sg <= max_slm;
+    auto enough_lws = max_lws / simd >= 1;
+    auto enough_items = items_num >= max_lws / simd * simd * pref_work_groups;
+
+    if (enough_slm && enough_lws && enough_items)
+        return GetMultiStageKernelsData(orgParams, optParams, FORCE_PRIORITY_4);
+    else
+        return GetCommonKernelsData(params, optParams, FORCE_PRIORITY_4);
+}
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/mvn/mvn_kernel_b_fs_yx_fsv16_imad.hpp
new file mode 100644 (file)
index 0000000..38d9e99
--- /dev/null
@@ -0,0 +1,56 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "mvn_kernel_base.h"
+#include <string>
+#include <vector>
+
+namespace kernel_selector {
+class MVNKernel_b_fs_yx_fsv16_imad : public MVNKernelBase {
+public:
+    using Parent = MVNKernelBase;
+    MVNKernel_b_fs_yx_fsv16_imad() : MVNKernelBase("mvn_gpu_b_fs_yx_fsv16_imad") {}
+    virtual ~MVNKernel_b_fs_yx_fsv16_imad() {}
+
+    KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
+    ParamsKey GetSupportedKey() const override;
+
+protected:
+    struct MultiDispatchData {
+        DispatchData stage_1;
+        DispatchData stage_2;
+        DispatchData stage_final;
+
+        size_t item_groups;
+    };
+
+    bool Validate(const Params&, const optional_params&) const override;
+    DispatchData SetDefault(const mvn_params& params) const override;
+    JitConstants GetJitConstants(const mvn_params& params, DispatchData kd) const override;
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return {
+            FusedOpType::ACTIVATION,
+            FusedOpType::QUANTIZE,
+            FusedOpType::ELTWISE,
+            FusedOpType::SCALE
+        };
+    }
+
+    KernelsData GetMultiStageKernelsData(const mvn_params& params, const optional_params&, float estimated_time) const;
+    MultiDispatchData SetDefaultForMulti(const mvn_params& params) const;
+};
+}  // namespace kernel_selector
index 0733c33..7e5942f 100644 (file)
 #include "mvn_kernel_selector.h"
 #include "mvn_kernel_ref.h"
 #include "mvn_kernel_bfyx_opt.h"
+#include "mvn_kernel_b_fs_yx_fsv16_imad.hpp"
 
 namespace kernel_selector {
 mvn_kernel_selector::mvn_kernel_selector() {
     Attach<MVNKernelRef>();
     Attach<MVNKernelBfyxOpt>();
+    Attach<MVNKernel_b_fs_yx_fsv16_imad>();
 }
 
 KernelsData mvn_kernel_selector::GetBestKernels(const Params& params, const optional_params& options) const {
     return GetNaiveBestKernel(params, options, KernelType::MVN);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
index 7666925..205cd16 100644 (file)
@@ -53,6 +53,7 @@ inline uint32_t SubGroupSize(WeightsLayout l) {
         case WeightsLayout::g_os_is_zyx_isv16_osv16:
         case WeightsLayout::giy_xs_os_xsv2_osv16__ao32:
         case WeightsLayout::g_os_is_yx_isv16_osv16:
+        case WeightsLayout::os_is_yx_osv16_isv16:
             return 16;
         case WeightsLayout::os_i_osv8__ai8:
         case WeightsLayout::iy_xs_os_xsv2_osv8__ao32:
@@ -167,7 +168,18 @@ ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_weig
 ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_params& params) const {
     DispatchData kd;
 
-    auto global = GetTensorFriendlyWorkGroups(params.inputs[0]);
+    auto& input = params.inputs[0];
+    DataTensor input_tensor = input;
+    // Image format reorders use the read_image and write_image functions, which operate on 4 channels at once and support only a single batch,
+    // so make sure that the reorder dispatch size covers the spatial sizes only
+    if (params.inputs[0].GetLayout() == DataLayout::image_2d_rgba || params.output.GetLayout() == DataLayout::image_2d_rgba) {
+        std::vector<size_t> input_sizes(4, 1);
+        input_sizes[0] = input.X().v;
+        input_sizes[1] = input.Y().v;
+        input_tensor = DataTensor(input_sizes, input.GetDType(), DataLayout::image_2d_rgba);
+    }
+
+    auto global = GetTensorFriendlyWorkGroups(input_tensor);
     auto local = GetOptimalLocalWorkGroupSizes(global, params.engineInfo);
 
     kd.gws0 = global[0];
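
The comment in the reorder hunk above boils down to: for image_2d_rgba the dispatch tensor keeps only X and Y, because read_image/write_image already cover the 4 RGBA channels of a single-batch image. Below is a small self-contained C++ model of that size selection; the struct is a simplified stand-in, not the real DataTensor API.

#include <cstddef>
#include <iostream>
#include <vector>

struct FakeTensor { std::size_t x, y, f, b; };   // simplified stand-in for DataTensor

std::vector<std::size_t> dispatch_sizes(const FakeTensor& in, bool is_image_rgba) {
    if (is_image_rgba)
        return {in.x, in.y, 1, 1};                // spatial sizes only, single batch/feature
    return {in.x, in.y, in.f, in.b};              // generic tensor-friendly work size
}

int main() {
    FakeTensor input{224, 224, 4, 1};             // 224x224 RGBA image (example)
    auto sizes = dispatch_sizes(input, /*is_image_rgba=*/true);
    std::cout << sizes[0] << " x " << sizes[1] << " x " << sizes[2] * sizes[3] << "\n";  // 224 x 224 x 1
    return 0;
}
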
index f5f7ef2..c85b044 100644 (file)
 #include <core/common/kernel_selector_utils.h>
 #include "resample_kernel_ref.h"
 
+#include <algorithm>
+#include <vector>
+#include <string>
+
 namespace kernel_selector {
 
 ParamsKey ResampleKernelRef::GetSupportedKey() const {
@@ -43,9 +47,66 @@ KernelsData ResampleKernelRef::GetKernelsData(const Params& params, const option
     return GetCommonKernelsData(params, options);
 }
 
+static size_t packing_factor(const resample_params& params) {
+    // TODO: Add support for input-only packing
+    bool in_out_8bit = (params.inputs[0].GetDType() == Datatype::UINT8 || params.inputs[0].GetDType() == Datatype::INT8) &&
+                       (params.output.GetDType() == Datatype::UINT8 || params.output.GetDType() == Datatype::INT8);
+
+    if (!in_out_8bit)
+        return 1;
+
+    auto get_layout_packing_factor = [](const DataLayout& layout) -> size_t {
+        switch (layout) {
+        case DataLayout::b_fs_yx_fsv16:
+            return 16;
+        case DataLayout::b_fs_yx_fsv4:
+            return 4;
+        case DataLayout::byxf_af32:
+            return 16;
+        default:
+            break;
+        }
+        return 1;
+    };
+
+    size_t input_factor = get_layout_packing_factor(params.inputs[0].GetLayout());
+    size_t output_factor = get_layout_packing_factor(params.output.GetLayout());
+
+    return std::min(input_factor, output_factor);
+}
+
+static bool use_packing(const resample_params& params) {
+    if (params.resampleType != ResampleType::NEAREST_NEIGHBOR)
+        return false;
+
+    auto pack = packing_factor(params);
+    if (pack == 1)
+        return false;
+
+    if (params.inputs[0].Feature().v % pack != 0 || params.output.Feature().v % pack != 0 ||
+        params.inputs[0].Feature().pad.before % pack != 0 || params.output.Feature().pad.before % pack != 0)
+        return false;
+
+    auto packed_work_items = params.output.X().v * params.output.Y().v * params.output.Z().v
+        * CeilDiv(params.output.Feature().v, pack) * params.output.Batch().v;
+    // TODO: Loosen this requirement to the minimum number of EUs needed to saturate cache bandwidth
+    constexpr size_t max_work_items_per_eu = 32 * 7;
+    auto minimum_work_items = params.engineInfo.computeUnitsCount * max_work_items_per_eu;
+
+    if (packed_work_items < minimum_work_items)
+        return false;
+
+    return true;
+}
+
 JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) const {
     JitConstants jit = ResampleKernelBase::GetJitConstants(params);
 
+    if (use_packing(params)) {
+        jit.AddConstant(MakeJitConstant("PACK_SIZE", packing_factor(params)));
+        jit.AddConstant(MakeJitConstant("FEATURE_PACKED_MODE", "1"));
+    }
+
     if (!params.fused_ops.empty()) {
         std::vector<std::string> idx_order;
         if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
@@ -60,4 +121,27 @@ JitConstants ResampleKernelRef::GetJitConstants(const resample_params& params) c
 
     return jit;
 }
+
+ResampleKernelBase::DispatchData ResampleKernelRef::SetDefault(const resample_params& arg) const {
+    auto dispatch = Parent::SetDefault(arg);
+
+    if (use_packing(arg)) {
+        auto pack = packing_factor(arg);
+        std::vector<size_t> global;
+        std::vector<size_t> local;
+
+        global = { arg.output.X().v, arg.output.Y().v * arg.output.Z().v, CeilDiv(arg.output.Feature().v, pack) * arg.output.Batch().v };
+        local = GetOptimalLocalWorkGroupSizes(global, arg.engineInfo);
+
+        dispatch.gws0 = global[0];
+        dispatch.gws1 = global[1];
+        dispatch.gws2 = global[2];
+
+        dispatch.lws0 = local[0];
+        dispatch.lws1 = local[1];
+        dispatch.lws2 = local[2];
+    }
+
+    return dispatch;
+}
 }  // namespace kernel_selector
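
As a worked example of use_packing above: an int8 NEAREST_NEIGHBOR resample in b_fs_yx_fsv16 gets a packing factor of 16, and packing is enabled only when the packed work size still fills the device. The EU count and output shape in this C++ sketch are illustrative assumptions.

#include <cstddef>
#include <iostream>

static std::size_t ceil_div(std::size_t a, std::size_t b) { return (a + b - 1) / b; }

int main() {
    const std::size_t pack = 16;                              // packing factor for int8 b_fs_yx_fsv16
    const std::size_t X = 56, Y = 56, Z = 1, F = 64, B = 1;   // output shape (example)
    const std::size_t compute_units = 24;                     // EU count (assumption)

    std::size_t packed_work_items  = X * Y * Z * ceil_div(F, pack) * B;  // 56*56*1*4*1 = 12544
    std::size_t minimum_work_items = compute_units * 32 * 7;             // 24*224 = 5376

    std::cout << (packed_work_items >= minimum_work_items
                      ? "packing enabled: PACK_SIZE=16, FEATURE_PACKED_MODE=1\n"
                      : "packing skipped: not enough work to saturate the device\n");
    return 0;
}
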
index 5ded77e..0963c29 100644 (file)
@@ -31,5 +31,8 @@ public:
                  FusedOpType::SCALE,
                  FusedOpType::ACTIVATION };
     }
+
+protected:
+    DispatchData SetDefault(const resample_params& arg) const override;
 };
 }  // namespace kernel_selector
index 1a72007..4f2284e 100644 (file)
@@ -56,7 +56,7 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
 }
 
 
-KERNEL (concatenation_gpu_ref)(__global UNIT_TYPE* input, __global UNIT_TYPE* output, uint output_offset_in_concat_axis)
+KERNEL (concatenation_gpu_ref)(__global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, uint output_offset_in_concat_axis)
 {
     const uint x = (uint)get_global_id(0) % INPUT0_SIZE_X;
     const uint y = (uint)get_global_id(0) / INPUT0_SIZE_X;
@@ -91,5 +91,5 @@ KERNEL (concatenation_gpu_ref)(__global UNIT_TYPE* input, __global UNIT_TYPE* ou
     uint input_offset  = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
     uint output_offset = FUNC_CALL(get_output_index)(out_b, out_f, out_w, out_z, out_y, out_x);
 
-    output[output_offset] = ACTIVATION(input[input_offset], ACTIVATION_PARAMS);
+    output[output_offset] = TO_OUTPUT_TYPE(ACTIVATION(input[input_offset], ACTIVATION_PARAMS));
 }
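
The concatenation change above replaces the single UNIT_TYPE with separate input/output types and an explicit TO_OUTPUT_TYPE conversion, so mixed-precision concatenations store correctly. A host-side C++ analogue of the same idea, with plain types standing in for the jit-defined OpenCL ones:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    std::vector<std::int8_t> input = {-3, 7, 120};     // int8 activations (example)
    std::vector<float> output(input.size());           // wider output precision (fp16 modeled as float)

    for (std::size_t i = 0; i < input.size(); ++i)
        output[i] = static_cast<float>(input[i]);      // the TO_OUTPUT_TYPE(...) step

    std::cout << output[0] << " " << output[1] << " " << output[2] << "\n";
    return 0;
}
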
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_1x1.cl
new file mode 100644 (file)
index 0000000..4cacde1
--- /dev/null
@@ -0,0 +1,232 @@
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+    #define ACCUMULATOR_TYPE int
+    #define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+    #define ACTIVATION_TYPE float
+    #define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+    #define ACCUMULATOR_TYPE INPUT0_TYPE
+    #define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+    #define ACTIVATION_TYPE INPUT0_TYPE
+    #define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+__attribute__((intel_reqd_sub_group_size(16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_1x1)(
+    const __global INPUT0_TYPE   *conv_input,
+    __global OUTPUT_TYPE         *output,
+    const __global FILTER_TYPE    *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE     *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx)
+{
+    #define LUT_VALUE_CLAMP(x) ((x) < (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + 1 ? (x) : 0)
+    const int tmp[16] = {
+        LUT_VALUE_CLAMP(0),
+        LUT_VALUE_CLAMP(1),
+        LUT_VALUE_CLAMP(2),
+        LUT_VALUE_CLAMP(3),
+        LUT_VALUE_CLAMP(4),
+        LUT_VALUE_CLAMP(5),
+        LUT_VALUE_CLAMP(6),
+        LUT_VALUE_CLAMP(7),
+        LUT_VALUE_CLAMP(8),
+        LUT_VALUE_CLAMP(9),
+        LUT_VALUE_CLAMP(10),
+        LUT_VALUE_CLAMP(11),
+        LUT_VALUE_CLAMP(12),
+        LUT_VALUE_CLAMP(13),
+        LUT_VALUE_CLAMP(14),
+        LUT_VALUE_CLAMP(15)
+    };
+    #undef LUT_VALUE_CLAMP
+
+#if FEATURE_LWS_SPLIT != 1
+    const uint subgroup_id = get_sub_group_id();
+#else
+    const uint subgroup_id = 0;
+#endif
+    const uint subgroup_local_id = get_sub_group_local_id();
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_b = (uint)(get_group_id(2) * 32) / ALIGN(OUTPUT_FEATURE_NUM, 32);
+    const uint out_fg = (uint)(get_group_id(2) * 32) % ALIGN(OUTPUT_FEATURE_NUM, 32);
+    const uint out_f = out_fg + subgroup_local_id;
+
+    const uint feature_offset = subgroup_id * INPUT0_FEATURE_NUM / FEATURE_LWS_SPLIT;
+
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * 2] = { 0 };
+
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+    
+    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, feature_offset, 0, 0);
+    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16, feature_offset, 0, 0);
+
+    __attribute__((opencl_unroll_hint(1)))
+    for(uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16)/FEATURE_LWS_SPLIT; k++ ) {
+        uint4 weights_val = vload4(0, (__global uint*)(weights + filter_idx));
+        uint4 weights_val2 = vload4(0, (__global uint *)(weights + filter_idx2));
+
+        uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, feature_offset + k * 16, input_y, input_x + tmp[get_sub_group_local_id()]);
+        uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+            const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val.s0)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val.s1)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val.s2)));
+            dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val.s3)));
+
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s0)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s1)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s2)));
+            dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X)), as_char4(weights_val2.s3)));
+        }
+
+        filter_idx += 16 * 16;
+        filter_idx2 += 16 * 16;
+    }
+
+#if FEATURE_LWS_SPLIT != 1
+   __local ACCUMULATOR_TYPE partial_acc[16 * OUT_BLOCK_WIDTH * (FEATURE_LWS_SPLIT - 1) * 2];
+    if (subgroup_id == 0) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 1) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[i * 16 + subgroup_local_id] = dotProd[i];
+            dotProd[i] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 2) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[2 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[3 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    } else if (subgroup_id == 3) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+            partial_acc[4 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[5 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (subgroup_id >= 2)
+        return;
+    __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+    for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+        dotProd[i] += partial_acc[(i + subgroup_id * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        dotProd[i] += partial_acc[(i + (subgroup_id + 2) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        dotProd[i] += partial_acc[(i + (subgroup_id + 4) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+    }
+#endif
+
+#if FEATURE_LWS_SPLIT == 1
+#   define OUTPUT_FEATURES_PER_WI 2
+#   if BIAS_TERM
+    BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f], biases[out_f + 16] };
+#   endif
+#else
+#   define OUTPUT_FEATURES_PER_WI 1
+#   if BIAS_TERM
+    BIAS_TYPE bias[OUTPUT_FEATURES_PER_WI] = { biases[out_f + subgroup_id * 16] };
+#   endif
+#endif
+
+    for (uint j = 0; j < OUTPUT_FEATURES_PER_WI; j++) {
+        uint out_f_offset = subgroup_id * 16 + j * 16;
+
+#if OUTPUT_FEATURE_NUM % 32 != 0 && OUTPUT_FEATURE_NUM % 32 <= 16
+        if (out_fg + 32 > OUTPUT_FEATURE_NUM && out_f_offset >= OUTPUT_FEATURE_NUM % 32)
+            break;
+#endif
+
+        const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + out_f_offset, out_y, out_x);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_PRELOAD
+#endif
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+
+#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && i >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+                break;
+#endif
+            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i] + bias[j];
+#else
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i];
+#endif
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+            #if FUSED_OPS_CAN_USE_PRELOAD
+                FUSED_OPS_CALC
+            #else
+                FUSED_OPS
+            #endif
+            result = FUSED_OPS_RESULT;
+#else
+            result = TO_OUTPUT_TYPE(dequantized);
+#endif
+
+#if OUTPUT_FEATURE_NUM % 16 != 0
+            if (out_fg + out_f_offset + 16 > OUTPUT_FEATURE_NUM && subgroup_local_id >= OUTPUT_FEATURE_NUM % 16)
+                result = (OUTPUT_TYPE)0;
+#endif
+            output[dst_index + i * 16] = result;
+        }
+    }
+
+#undef OUTPUT_FEATURES_PER_WI
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
+#undef ALIGN
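
For the LUT_VALUE_CLAMP table in the 1x1 kernel above: each of the 16 sub-group lanes loads the input column at offset tmp[lane], and lanes beyond the (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + 1 window are clamped to offset 0 so their unused loads stay in bounds. A small C++ sketch printing the table for OUT_BLOCK_WIDTH = 7 and STRIDE_SIZE_X = 1 (illustrative parameter choices):

#include <iostream>

int main() {
    const int OUT_BLOCK_WIDTH = 7, STRIDE_SIZE_X = 1;             // example configuration
    const int window = (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + 1; // 7 input columns are actually needed

    for (int lane = 0; lane < 16; ++lane) {
        int offset = lane < window ? lane : 0;                    // LUT_VALUE_CLAMP(lane)
        std::cout << offset << (lane == 15 ? '\n' : ' ');
    }
    // prints: 0 1 2 3 4 5 6 0 0 0 0 0 0 0 0 0
    return 0;
}
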
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3.cl
new file mode 100644 (file)
index 0000000..5915c84
--- /dev/null
@@ -0,0 +1,187 @@
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+#define ACCUMULATOR_TYPE int
+#define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+#define ACTIVATION_TYPE float
+#define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+#define ACCUMULATOR_TYPE INPUT0_TYPE
+#define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+#define ACTIVATION_TYPE INPUT0_TYPE
+#define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
+
+// int8 conv_input and weights data are packed into int32 "batches";
+// the loads below reinterpret the pointers as int/uint instead of INPUT0_TYPE/FILTER_TYPE for convenience
+__attribute__((intel_reqd_sub_group_size(16)))
+__attribute__((reqd_work_group_size(1, 1, 16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_imad_3x3)(
+    const __global INPUT0_TYPE *conv_input,
+    __global OUTPUT_TYPE *output,
+    const __global FILTER_TYPE *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx) {
+
+    #define LUT_VALUE_CLAMP(x) ((x) < (OUT_BLOCK_WIDTH - 1) * STRIDE_SIZE_X + FILTER_SIZE_X ? (x) : 0)
+    const int tmp[16] = {
+        LUT_VALUE_CLAMP(0),
+        LUT_VALUE_CLAMP(1),
+        LUT_VALUE_CLAMP(2),
+        LUT_VALUE_CLAMP(3),
+        LUT_VALUE_CLAMP(4),
+        LUT_VALUE_CLAMP(5),
+        LUT_VALUE_CLAMP(6),
+        LUT_VALUE_CLAMP(7),
+        LUT_VALUE_CLAMP(8),
+        LUT_VALUE_CLAMP(9),
+        LUT_VALUE_CLAMP(10),
+        LUT_VALUE_CLAMP(11),
+        LUT_VALUE_CLAMP(12),
+        LUT_VALUE_CLAMP(13),
+        LUT_VALUE_CLAMP(14),
+        LUT_VALUE_CLAMP(15)
+    };
+    #undef LUT_VALUE_CLAMP
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_b = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) / ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
+    const uint out_fg = (uint)(get_group_id(2) * OFM_SIZE_PER_SIMD) % ALIGN(OUTPUT_FEATURE_NUM, OFM_SIZE_PER_SIMD);
+    const uint out_f = out_fg + get_sub_group_local_id();
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * OFM_BLOCKS_PER_SIMD] = {0};
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    uint filter_idx  = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, 0, 0, 0);
+#if OFM_BLOCKS_PER_SIMD == 2
+    uint filter_idx2 = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f + 16, 0, 0, 0);
+#endif
+
+    __attribute__((opencl_unroll_hint(1)))
+    for (uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16); k++) {
+        __attribute__((opencl_unroll_hint(1)))
+        for (uint j = 0; j < FILTER_SIZE_Y; j++) {
+            uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, k * 16, input_y + j, input_x + tmp[get_sub_group_local_id()]);
+            uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+
+            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for (uint i = 0; i < FILTER_SIZE_X; i++) {
+
+                uint4 weights_val = vload4(0, (__global uint *)(weights + filter_idx));
+#if OFM_BLOCKS_PER_SIMD == 2
+                uint4 weights_val3 = vload4(0, (__global uint *)(weights + filter_idx2));
+#endif
+
+                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s0)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s1)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s2)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s3)));
+
+#if OFM_BLOCKS_PER_SIMD == 2
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)),  as_char4(weights_val3.s0)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)),  as_char4(weights_val3.s1)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)),  as_char4(weights_val3.s2)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)),  as_char4(weights_val3.s3)));
+#endif
+                }
+                filter_idx += 16 * 16;
+#if OFM_BLOCKS_PER_SIMD == 2
+                filter_idx2 += 16 * 16;
+#endif
+            }
+        } 
+    }
+
+#if BIAS_TERM
+    BIAS_TYPE bias[OFM_BLOCKS_PER_SIMD] = { biases[out_f]
+#if OFM_BLOCKS_PER_SIMD == 2
+        , biases[out_f + 16]
+#endif
+    };
+#endif
+    __attribute__((opencl_unroll_hint(OFM_BLOCKS_PER_SIMD)))
+    for (uint j = 0; j < OFM_BLOCKS_PER_SIMD; j++) {
+        const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + j * 16, out_y, out_x);
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_PRELOAD;
+#endif
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++) {
+
+#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
+            if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && i >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
+                break;
+#endif
+            ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i] + bias[j];
+#else
+            dequantized = (ACTIVATION_TYPE)dotProd[OUT_BLOCK_WIDTH * j + i];
+#endif
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+    #if FUSED_OPS_CAN_USE_PRELOAD
+            FUSED_OPS_CALC;
+    #else
+            FUSED_OPS;
+    #endif
+            result = FUSED_OPS_RESULT;
+#else
+            result = TO_OUTPUT_TYPE(dequantized);
+#endif
+
+#if OUTPUT_FEATURE_NUM % 16 != 0
+            if (out_fg + j * 16 + 16 > OUTPUT_FEATURE_NUM && get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % 16)
+                result = (OUTPUT_TYPE)0;
+#endif
+            output[dst_index + i * 16] = result;
+        }
+    }
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
+#undef ALIGN
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3_ks.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv16_imad_3x3_ks.cl
new file mode 100644 (file)
index 0000000..df87ae0
--- /dev/null
@@ -0,0 +1,197 @@
+// Copyright (c) 2018-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/common.cl"
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/mmad.cl"
+
+#if QUANTIZATION_TERM
+    #define ACCUMULATOR_TYPE int
+    #define TO_ACCUMULATOR_TYPE(x) convert_int(x)
+    #define ACTIVATION_TYPE float
+    #define TO_ACTIVATION_TYPE(x) convert_float(x)
+#else
+    #define ACCUMULATOR_TYPE INPUT0_TYPE
+    #define TO_ACCUMULATOR_TYPE(x) TO_INPUT0_TYPE(x)
+    #define ACTIVATION_TYPE INPUT0_TYPE
+    #define TO_ACTIVATION_TYPE(x) TO_INPUT0_TYPE(x)
+#endif
+
+#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
+#define AS_TYPE_N_(type, n, x) as_##type##n(x)
+#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
+#define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1)/(b))
+
+__attribute__((intel_reqd_sub_group_size(16)))
+KERNEL(convolution_gpu_b_fs_yx_fsv16_3x3_ks)(
+    const __global INPUT0_TYPE   *conv_input,
+    __global OUTPUT_TYPE         *output,
+    const __global FILTER_TYPE    *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE     *biases,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx)
+{
+#if OUT_BLOCK_WIDTH == 7 && STRIDE_SIZE_X == 1
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0};
+#elif OUT_BLOCK_WIDTH == 7 && STRIDE_SIZE_X == 2
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0};
+#elif OUT_BLOCK_WIDTH == 8 && STRIDE_SIZE_X == 1
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0};
+#else  // OUT_BLOCK_WIDTH == 8 && STRIDE_SIZE_X == 2
+    const int tmp[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+#endif
+
+    const uint out_x = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;
+    const uint out_y = get_global_id(1);
+    const uint out_f = (uint)(get_group_id(2) * 32 + get_sub_group_local_id());
+    const uint subgroup_id = get_sub_group_id();
+    const uint subgroup_local_id = get_sub_group_local_id();
+    const uint feature_offset = subgroup_id * INPUT0_FEATURE_NUM / 4;
+    const uint out_b = (uint)(get_group_id(2) * 32) / OUTPUT_FEATURE_NUM;
+
+    ACCUMULATOR_TYPE dotProd[OUT_BLOCK_WIDTH * 2] = { 0 };
+    const int input_x = out_x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = out_y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    uint filter_idx = GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(FILTER, out_f, feature_offset, 0, 0);
+    uint diff_filter_idx = 16*3*3*FILTER_IFM_NUM;
+
+    __attribute__((opencl_unroll_hint(1)))
+    for(uint k = 0; k < CEIL_DIV(INPUT0_FEATURE_NUM, 16)/4; k++ ) {
+        __attribute__((opencl_unroll_hint(1)))
+        for(uint j = 0; j < FILTER_SIZE_Y; j++) {
+            uint input_idx = GET_DATA_B_FS_YX_FSV16_INDEX(INPUT0, out_b, feature_offset + k * 16, input_y + j, input_x + tmp[subgroup_local_id]);
+            uint4 input_val0 = vload4(0, (__global uint *)(conv_input + input_idx));
+            
+             __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for(uint i = 0; i < FILTER_SIZE_X; i++) {
+
+                uint4 weights_val = vload4(0, (__global uint*)(weights + filter_idx));                
+                uint4 weights_val3 = vload4(0, (__global uint *)(weights + filter_idx + diff_filter_idx));
+
+                __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+                for(uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {
+                    const uint ow_offset = ow + OUT_BLOCK_WIDTH;
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s0)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s1)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s2)));
+                    dotProd[ow] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val.s3)));
+
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s0, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s0)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s1, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s1)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s2, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s2)));
+                    dotProd[ow_offset] = TO_ACCUMULATOR_TYPE(IMAD(dotProd[ow_offset], AS_INPUT0_TYPE_4(intel_sub_group_shuffle(input_val0.s3, ow * STRIDE_SIZE_X + i)), as_char4(weights_val3.s3)));
+                }
+                filter_idx += 16 * 16;
+            }
+        }
+    }
+
+    // k-slicing: sum up the partial accumulators through SLM
+    __local ACCUMULATOR_TYPE partial_acc[16 * OUT_BLOCK_WIDTH * 6];
+    if(subgroup_id == 0)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }       
+    }
+    else if(subgroup_id == 1)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for(uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[i * 16 + subgroup_local_id] = dotProd[i];
+            dotProd[i] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+    else if (subgroup_id == 2)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[2 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[3 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+
+        }
+    }
+    else if (subgroup_id == 3)
+    {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            partial_acc[4 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i];
+            partial_acc[5 * 16 * OUT_BLOCK_WIDTH + i * 16 + subgroup_local_id] = dotProd[i + OUT_BLOCK_WIDTH];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (subgroup_id < 2) {
+        __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+        for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+        {
+            dotProd[i] += partial_acc[(i + subgroup_id * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+            dotProd[i] += partial_acc[(i + (subgroup_id + 2) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+            dotProd[i] += partial_acc[(i + (subgroup_id + 4) * OUT_BLOCK_WIDTH) * 16 + subgroup_local_id];
+        }
+#if BIAS_TERM
+    BIAS_TYPE bias = biases[out_f + get_sub_group_id() * 16];
+#endif
+
+#if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
+    FUSED_OPS_PRELOAD;
+#endif
+    const uint dst_index = GET_DATA_B_FS_YX_FSV16_INDEX(OUTPUT, out_b, out_f + subgroup_id * 16, out_y, out_x);
+     __attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
+    for (uint i = 0; i < OUT_BLOCK_WIDTH; i++)
+    {
+        ACTIVATION_TYPE dequantized = (ACTIVATION_TYPE)0;
+#if BIAS_TERM
+        dequantized = (ACTIVATION_TYPE)dotProd[i] + bias;
+#else
+        dequantized = (ACTIVATION_TYPE)dotProd[i];
+#endif
+#if HAS_FUSED_OPS
+    #if FUSED_OPS_CAN_USE_PRELOAD
+        FUSED_OPS_CALC;
+    #else
+        FUSED_OPS;
+    #endif
+        output[dst_index + i * 16] = FUSED_OPS_RESULT;
+#else
+        output[dst_index + i * 16] = TO_OUTPUT_TYPE(dequantized);
+#endif
+    }
+    }
+}
+
+#undef AS_INPUT0_TYPE_4
+#undef AS_TYPE_N
+#undef AS_TYPE_N_
+#undef MAKE_VECTOR_TYPE
+#undef TO_ACTIVATION_TYPE
+#undef ACTIVATION_TYPE
+#undef TO_ACCUMULATOR_TYPE
+#undef ACCUMULATOR_TYPE
+
+#undef CEIL_DIV
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl
new file mode 100644 (file)
index 0000000..742e544
--- /dev/null
@@ -0,0 +1,700 @@
+/*
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+*/
+
+#include "include/imad.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+#include "include/mmad.cl"
+
+// ======================================================================================
+// Host side jit-constants:
+// ======================================================================================
+// SIMD   [{8, 16}] - Sub-group/simd size for the kernel. Used as third dimension of
+//                    local work size.
+// TILE_X [uint] - Number of output values along the x dimension calculated by a single
+//                 work-item/sub-group.
+// LWS0 [uint] - Local work size 0th dimension.
+// LWS1 [uint] - Local work size 1st dimension.
+// FILTER_BLOCKED - Number of filter spatial elements to process using IMAD. Must be less
+//                  than or equal to the total filter spatial size.
+//                  Currently only supported as a multiple of 4.
+// ======================================================================================
+// Supported operations:
+// input/output format: any b_fs_yx_fsv<k> - where <k> >= SIMD,
+//                      input and output formats must be the same
+// weights format:      os_i_yxs_oxv<k>_yxsv4 - where <k> same as in input format
+// input data types:   uchar8, char8
+// weights data types: uchar8, char8
+// output data types:  uchar8, char8, half, float
+// asymmetric quantization: weights zero points, compensation term
+// ======================================================================================
+
+#if OUTPUT_LAYOUT_B_FS_YX_FSV16
+#   define FSV 16
+#elif OUTPUT_LAYOUT_B_FS_YX_FSV32
+#   define FSV 32
+#else
+#   error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported output layout.
+#endif
+
+#define F_PER_WI ((FSV) / (SIMD))
+
+#define DEQUANTIZED_TYPE float
+#define DEQUANTIZED_TYPE2 MAKE_VECTOR_TYPE(DEQUANTIZED_TYPE, 2)
+#define DEQUANTIZED_TYPE4 MAKE_VECTOR_TYPE(DEQUANTIZED_TYPE, 4)
+
+#define INPUT_TYPE        INPUT0_TYPE
+#define INPUT_TYPE2       MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
+#define INPUT_TYPE4       MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
+#define INPUT_TYPE8       MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+#define INPUT_TYPE16      MAKE_VECTOR_TYPE(INPUT0_TYPE, 16)
+
+#define FILTER_TYPE4      MAKE_VECTOR_TYPE(FILTER_TYPE, 4)
+
+#define OUTPUT_TYPE2      MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2)
+#define OUTPUT_TYPE4      MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
+#define OUTPUT_TYPE8      MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)
+#define OUTPUT_TYPE16     MAKE_VECTOR_TYPE(OUTPUT_TYPE, 16)
+
+#define AS_INPUT_TYPE(val)        CAT(as_, INPUT_TYPE)(val)
+#define AS_INPUT_TYPE2(val)       CAT(as_, INPUT_TYPE2)(val)
+#define AS_INPUT_TYPE4(val)       CAT(as_, INPUT_TYPE4)(val)
+#define AS_INPUT_TYPE8(val)       CAT(as_, INPUT_TYPE8)(val)
+#define AS_INPUT_TYPE16(val)      CAT(as_, INPUT_TYPE16)(val)
+
+#define AS_FILTER_TYPE4(val)      CAT(as_, FILTER_TYPE4)(val)
+
+#define TO_DEQUANTIZED_TYPE(val)  CAT(convert_, DEQUANTIZED_TYPE)(val)
+
+#define GET_INPUT_INDEX(b, f, y, x)    INPUT0_GET_INDEX(b, f, y, x)
+#if FSV == 16
+#   define GET_WEIGHTS_INDEX(g, o, i, y, x)  GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(FILTER, g, 0, 0, y, x)
+#else
+#   define GET_WEIGHTS_INDEX(g, o, i, y, x)  GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(FILTER, g, 0, 0, y, x)
+#endif
+#define GET_OUTPUT_INDEX(b, f, y, x)   OUTPUT_GET_INDEX(b, f, y, x)
+#define GET_BIAS_INDEX(b, f, y, x)     BIAS_GET_INDEX(b, f, y, x)
+
+#define INPUT_X_PITCH FSV
+#define INPUT_Y_PITCH (FSV * (INPUT0_SIZE_X + INPUT0_PAD_BEFORE_SIZE_X + INPUT0_PAD_AFTER_SIZE_X))
+
+#define WEIGHTS_YXS_PITCH (4 * FSV)
+
+#define FILTER_SPATIAL_SIZE (FILTER_SIZE_X * FILTER_SIZE_Y)
+
+#if OUTPUT_TYPE_SIZE == 1
+#   define OUTPUT_BLOCK_WRITE(ptr, val)    BLOCK_WRITE_UC_1((__global uchar*)(ptr), as_uchar(val));
+#   define OUTPUT_BLOCK_WRITE2(ptr, val)   BLOCK_WRITE_UC_2((__global uchar*)(ptr), as_uchar2(val));
+#   define OUTPUT_BLOCK_WRITE4(ptr, val)   BLOCK_WRITE_UC_4((__global uchar*)(ptr), as_uchar4(val));
+#   define OUTPUT_BLOCK_WRITE8(ptr, val)   BLOCK_WRITE_UC_8((__global uchar*)(ptr), as_uchar8(val));
+#   define OUTPUT_BLOCK_WRITE16(ptr, val)  BLOCK_WRITE_UC_16((__global uchar*)(ptr), as_uchar16(val));
+#elif OUTPUT_TYPE_SIZE == 2
+#   define OUTPUT_BLOCK_WRITE(ptr, val)    intel_sub_group_block_write_us((__global ushort*)(ptr), as_ushort(val));
+#   define OUTPUT_BLOCK_WRITE2(ptr, val)   intel_sub_group_block_write_us2((__global ushort*)(ptr), as_ushort2(val));
+#   define OUTPUT_BLOCK_WRITE4(ptr, val)   intel_sub_group_block_write_us4((__global ushort*)(ptr), as_ushort4(val));
+#   define OUTPUT_BLOCK_WRITE8(ptr, val)   intel_sub_group_block_write_us8((__global ushort*)(ptr), as_ushort8(val));
+#   define OUTPUT_BLOCK_WRITE16(ptr, val)                                               \
+    OUTPUT_BLOCK_WRITE8(ptr, (val).lo)                                                  \
+    OUTPUT_BLOCK_WRITE8((__global ushort*)(ptr) + 8 * get_max_sub_group_size(), (val).hi)
+#elif OUTPUT_TYPE_SIZE == 4
+#   define OUTPUT_BLOCK_WRITE(ptr, val)    intel_sub_group_block_write((__global uint*)(ptr), as_uint(val));
+#   define OUTPUT_BLOCK_WRITE2(ptr, val)   intel_sub_group_block_write2((__global uint*)(ptr), as_uint2(val));
+#   define OUTPUT_BLOCK_WRITE4(ptr, val)   intel_sub_group_block_write4((__global uint*)(ptr), as_uint4(val));
+#   define OUTPUT_BLOCK_WRITE8(ptr, val)   intel_sub_group_block_write8((__global uint*)(ptr), as_uint8(val));
+#   define OUTPUT_BLOCK_WRITE16(ptr, val)                                               \
+    OUTPUT_BLOCK_WRITE8(ptr, (val).lo)                                                  \
+    OUTPUT_BLOCK_WRITE8((__global uint*)(ptr) + 8 * get_max_sub_group_size(), (val).hi)
+#else
+#   error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported output type.
+#endif
+
+#define VEC_TO_ARRAY_2(arr, vec, offset)                \
+    (arr)[(offset) + 0] = (vec).s0;                     \
+    (arr)[(offset) + 1] = (vec).s1
+#define VEC_TO_ARRAY_4(arr, vec, offset)                \
+    VEC_TO_ARRAY_2(arr, (vec).s01, offset);             \
+    VEC_TO_ARRAY_2(arr, (vec).s23, (offset) + 2)
+#define VEC_TO_ARRAY_8(arr, vec, offset)                \
+    VEC_TO_ARRAY_4(arr, (vec).s0123, offset);           \
+    VEC_TO_ARRAY_4(arr, (vec).s4567, (offset) + 4)
+#define VEC_TO_ARRAY_16(arr, vec, offset)               \
+    VEC_TO_ARRAY_8(arr, (vec).s01234567, offset);       \
+    VEC_TO_ARRAY_8(arr, (vec).s89abcdef, (offset) + 8)
+
+#define ARRAY_TO_VEC_2(vec, arr, offset)                \
+    (vec).s0 = (arr)[(offset)];                         \
+    (vec).s1 = (arr)[(offset) + 1]
+
+#define ARRAY_TO_VEC_4(vec, arr, offset)                \
+    ARRAY_TO_VEC_2((vec).s01, arr, offset);             \
+    ARRAY_TO_VEC_2((vec).s23, arr, (offset) + 2)
+
+#define ARRAY_TO_VEC_8(vec, arr, offset)                \
+    ARRAY_TO_VEC_4((vec).s0123, arr, offset);           \
+    ARRAY_TO_VEC_4((vec).s4567, arr, (offset) + 4)
+
+#define ARRAY_TO_VEC_16(vec, arr, offset)               \
+    ARRAY_TO_VEC_8((vec).s01234567, arr, offset);       \
+    ARRAY_TO_VEC_8((vec).s89abcdef, arr, (offset) + 8)
+
+#if FILTER_BLOCKED % 4 != 0
+#   error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - FILTER_BLOCKED must be multiple of 4.
+#endif
+
+#ifndef OUTPUT_PAD_VALUE
+#   define OUTPUT_PAD_VALUE (OUTPUT_TYPE)(0)
+#   define OUTPUT_PAD_VALUE_undef
+#endif
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS0, LWS1, SIMD)))
+KERNEL(convolution)(
+    const __global  INPUT0_TYPE  *input,
+    __global        OUTPUT_TYPE  *output,
+    const __global  FILTER_TYPE  *weights,
+#if BIAS_TERM
+    const __global BIAS_TYPE     *biases,
+#endif
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+    const __global WEIGHTS_ZERO_POINTS_TYPE *weights_zp,
+#endif
+#if ASYMMETRIC_DATA_QUANTIZATION
+    const __global ACTIVATIONS_ZERO_POINTS_TYPE *activations_zp,
+#endif
+#if COMPENSATION_TERM
+    const __global COMPENSATION_TYPE *compensation,
+#endif
+#if HAS_FUSED_OPS_DECLS
+    FUSED_OPS_DECLS,
+#endif
+    uint split_idx
+) {
+    uint x = get_global_id(0) * TILE_X;
+    uint y = get_global_id(1);
+    uint bf = get_group_id(2);
+    uint b = bf % OUTPUT_BATCH_NUM;
+    uint f = bf / OUTPUT_BATCH_NUM * FSV;
+
+    uint input_offset = GET_INPUT_INDEX(b, f, (int)y * STRIDE_SIZE_Y - PADDING_SIZE_Y, (int)x * STRIDE_SIZE_X - PADDING_SIZE_X);
+    uint weights_offset = GET_WEIGHTS_INDEX(f, 0, 0, 0, 0);
+
+    int acc[TILE_X * F_PER_WI] = { };
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+    int src_sum[TILE_X * F_PER_WI] = { };
+#endif
+
+    __attribute__((opencl_unroll_hint))
+    for (uint fi = 0; fi < FILTER_BLOCKED / 4 * 4; fi += 4) {
+        // Loop over 4 filter spatials that match imad case
+        uint4 fis = (uint4)(fi, fi + 1, fi + 2, fi + 3);
+
+        uint4 fx = fis % FILTER_SIZE_X;
+        uint4 fy = fis / FILTER_SIZE_X;
+
+        // Input loading:
+        INPUT_TYPE in_trans0[TILE_X * F_PER_WI];
+        INPUT_TYPE in_trans1[TILE_X * F_PER_WI];
+        INPUT_TYPE in_trans2[TILE_X * F_PER_WI];
+        INPUT_TYPE in_trans3[TILE_X * F_PER_WI];
+#if STRIDE_SIZE_X == 1
+        // With stride 1, block reads can be used to load all TILE_X inputs at once.
+        // Block-read ladder selecting the optimal combination of block reads for TILE_X
+        uint4 input_x_offset = fx * (DILATION_SIZE_X * INPUT_X_PITCH);
+        uint4 input_y_offset = fy * (DILATION_SIZE_Y * INPUT_Y_PITCH);
+        uint4 input_spatial_offset = input_x_offset + input_y_offset;
+        uint4 input_idx = input_spatial_offset + input_offset;
+
+        uint tx = 0;
+        __attribute__((opencl_unroll_hint))
+        for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+            INPUT_TYPE16 tmp_in0 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s0)));
+            INPUT_TYPE16 tmp_in1 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s1)));
+            INPUT_TYPE16 tmp_in2 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s2)));
+            INPUT_TYPE16 tmp_in3 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx.s3)));
+
+            VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
+            VEC_TO_ARRAY_16(in_trans1, tmp_in1, tx);
+            VEC_TO_ARRAY_16(in_trans2, tmp_in2, tx);
+            VEC_TO_ARRAY_16(in_trans3, tmp_in3, tx);
+
+            input_idx += 16 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 16 >= 8) {
+            INPUT_TYPE8 tmp_in0 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s0)));
+            INPUT_TYPE8 tmp_in1 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s1)));
+            INPUT_TYPE8 tmp_in2 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s2)));
+            INPUT_TYPE8 tmp_in3 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx.s3)));
+
+            VEC_TO_ARRAY_8(in_trans0, tmp_in0, tx);
+            VEC_TO_ARRAY_8(in_trans1, tmp_in1, tx);
+            VEC_TO_ARRAY_8(in_trans2, tmp_in2, tx);
+            VEC_TO_ARRAY_8(in_trans3, tmp_in3, tx);
+
+            input_idx += 8 * SIMD;
+            tx += 8;
+        }
+        if (TILE_X * F_PER_WI % 8 >= 4) {
+            INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s0)));
+            INPUT_TYPE4 tmp_in1 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s1)));
+            INPUT_TYPE4 tmp_in2 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s2)));
+            INPUT_TYPE4 tmp_in3 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s3)));
+
+            VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx);
+            VEC_TO_ARRAY_4(in_trans1, tmp_in1, tx);
+            VEC_TO_ARRAY_4(in_trans2, tmp_in2, tx);
+            VEC_TO_ARRAY_4(in_trans3, tmp_in3, tx);
+
+            input_idx += 4 * SIMD;
+            tx += 4;
+        }
+        if (TILE_X * F_PER_WI % 4 >= 2) {
+            INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s0)));
+            INPUT_TYPE2 tmp_in1 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s1)));
+            INPUT_TYPE2 tmp_in2 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s2)));
+            INPUT_TYPE2 tmp_in3 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s3)));
+
+            VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx);
+            VEC_TO_ARRAY_2(in_trans1, tmp_in1, tx);
+            VEC_TO_ARRAY_2(in_trans2, tmp_in2, tx);
+            VEC_TO_ARRAY_2(in_trans3, tmp_in3, tx);
+
+            input_idx += 2 * SIMD;
+            tx += 2;
+        }
+        if (TILE_X * F_PER_WI % 2 == 1) {
+            in_trans0[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s0)));
+            in_trans1[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s1)));
+            in_trans2[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s2)));
+            in_trans3[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s3)));
+        }
+#else
+        uint4 input_x_offset = fx * DILATION_SIZE_X * INPUT_X_PITCH;
+        uint4 input_y_offset = fy * DILATION_SIZE_Y * INPUT_Y_PITCH;
+        uint4 input_spatial_offset = input_x_offset + input_y_offset;
+        uint4 input_start_offset = input_spatial_offset + input_offset;
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            uint4 input_idx = input_start_offset + tx * STRIDE_SIZE_X * INPUT_X_PITCH;
+            // Block reads along feature slice
+            uint fw = 0;
+            __attribute__((opencl_unroll_hint))
+            for (; fw + 4 <= F_PER_WI; fw += 4) {
+                INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s0)));
+                INPUT_TYPE4 tmp_in1 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s1)));
+                INPUT_TYPE4 tmp_in2 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s2)));
+                INPUT_TYPE4 tmp_in3 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx.s3)));
+
+                VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_4(in_trans1, tmp_in1, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_4(in_trans2, tmp_in2, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_4(in_trans3, tmp_in3, tx * F_PER_WI + fw);
+
+                input_idx += 4 * SIMD;
+            }
+            if (F_PER_WI % 4 >= 2) {
+                INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s0)));
+                INPUT_TYPE2 tmp_in1 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s1)));
+                INPUT_TYPE2 tmp_in2 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s2)));
+                INPUT_TYPE2 tmp_in3 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx.s3)));
+
+                VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_2(in_trans1, tmp_in1, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_2(in_trans2, tmp_in2, tx * F_PER_WI + fw);
+                VEC_TO_ARRAY_2(in_trans3, tmp_in3, tx * F_PER_WI + fw);
+
+                input_idx += 2 * SIMD;
+                fw += 2;
+            }
+            if (F_PER_WI % 2 == 1) {
+                in_trans0[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s0)));
+                in_trans1[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s1)));
+                in_trans2[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s2)));
+                in_trans3[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx.s3)));
+            }
+        }
+#endif
+        // Weights loading:
+        FILTER_TYPE4 wei[F_PER_WI];
+        __attribute__((opencl_unroll_hint))
+        for (uint fw = 0; fw < F_PER_WI; ++fw) {
+            wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
+        }
+
+        // Transpose input:
+        INPUT_TYPE4 in[TILE_X * F_PER_WI];
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                uint in_offset = tx * F_PER_WI + fw;
+                in[in_offset] = (INPUT_TYPE4)(in_trans0[in_offset], in_trans1[in_offset], in_trans2[in_offset], in_trans3[in_offset]);
+            }
+        }
+
+        // IMAD:
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                acc[tx * F_PER_WI + fw] = IMAD(acc[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], wei[fw]);
+            }
+        }
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+        // Accumulate input values for asymmetric weights:
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                src_sum[tx * F_PER_WI + fw] = IMAD(src_sum[tx * F_PER_WI + fw], in[tx * F_PER_WI + fw], (char4)(1, 1, 1, 1));
+            }
+        }
+#endif
+
+        weights_offset += WEIGHTS_YXS_PITCH;
+    }
+
+
+#if FILTER_BLOCKED < FILTER_SPATIAL_SIZE
+    // Leftovers in the filter spatial size - use raw multiplication instead of IMAD
+    // Load the weights before the loop to avoid byte-scattered reads; there are at most 3 leftovers
+    FILTER_TYPE4 wei[F_PER_WI];
+    __attribute__((opencl_unroll_hint))
+    for (uint fw = 0; fw < F_PER_WI; ++fw) {
+        wei[fw] = AS_FILTER_TYPE4(intel_sub_group_block_read((const __global uint*)(weights + weights_offset) + fw * SIMD));
+    }
+
+    __attribute__((opencl_unroll_hint))
+    for (uint fi = 0; fi < FILTER_SPATIAL_SIZE - FILTER_BLOCKED; ++fi) {
+        // Input loading:
+        uint fx = (fi + FILTER_BLOCKED) % FILTER_SIZE_X;
+        uint fy = (fi + FILTER_BLOCKED) / FILTER_SIZE_X;
+
+        INPUT_TYPE in_trans0[TILE_X * F_PER_WI];
+#   if STRIDE_SIZE_X == 1
+        uint input_x_offset = fx * (DILATION_SIZE_X * INPUT_X_PITCH);
+        uint input_y_offset = fy * (DILATION_SIZE_Y * INPUT_Y_PITCH);
+        uint input_spatial_offset = input_x_offset + input_y_offset;
+        uint input_idx = input_spatial_offset + input_offset;
+
+        uint tx = 0;
+        __attribute__((opencl_unroll_hint))
+        for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+            INPUT_TYPE16 tmp_in0 = AS_INPUT_TYPE16(BLOCK_READ_UC_16((const __global uchar*)(input + input_idx)));
+            VEC_TO_ARRAY_16(in_trans0, tmp_in0, tx);
+            input_idx += 16 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 16 >= 8) {
+            INPUT_TYPE8 tmp_in0 = AS_INPUT_TYPE8(BLOCK_READ_UC_8((const __global uchar*)(input + input_idx)));
+            VEC_TO_ARRAY_8(in_trans0, tmp_in0, tx);
+            input_idx += 8 * SIMD;
+            tx += 8;
+        }
+        if (TILE_X * F_PER_WI % 8 >= 4) {
+            INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx)));
+            VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx);
+            input_idx += 4 * SIMD;
+            tx += 4;
+        }
+        if (TILE_X * F_PER_WI % 4 >= 2) {
+            INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx)));
+            VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx);
+            input_idx += 2 * SIMD;
+            tx += 2;
+        }
+        if (TILE_X * F_PER_WI % 2 == 1) {
+            in_trans0[tx] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx)));
+        }
+#   else
+        uint input_x_offset = fx * DILATION_SIZE_X * INPUT_X_PITCH;
+        uint input_y_offset = fy * DILATION_SIZE_Y * INPUT_Y_PITCH;
+        uint input_spatial_offset = input_x_offset + input_y_offset;
+        uint input_start_offset = input_spatial_offset + input_offset;
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            uint input_idx = input_start_offset + tx * STRIDE_SIZE_X * INPUT_X_PITCH;
+            uint fw = 0;
+            __attribute__((opencl_unroll_hint))
+            for (; fw + 4 <= F_PER_WI; fw += 4) {
+                INPUT_TYPE4 tmp_in0 = AS_INPUT_TYPE4(BLOCK_READ_UC_4((const __global uchar*)(input + input_idx)));
+                VEC_TO_ARRAY_4(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+                input_idx += 4 * SIMD;
+            }
+            if (F_PER_WI % 4 >= 2) {
+                INPUT_TYPE2 tmp_in0 = AS_INPUT_TYPE2(BLOCK_READ_UC_2((const __global uchar*)(input + input_idx)));
+                VEC_TO_ARRAY_2(in_trans0, tmp_in0, tx * F_PER_WI + fw);
+                input_idx += 2 * SIMD;
+                fw += 2;
+            }
+            if (F_PER_WI % 2 == 1) {
+                in_trans0[tx * F_PER_WI + fw] = AS_INPUT_TYPE(BLOCK_READ_UC_1((const __global uchar*)(input + input_idx)));
+            }
+        }
+#   endif
+        // Raw multiply accumulate:
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                acc[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw] * (int)wei[fw][fi];
+            }
+        }
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+        // Accumulate input values for asymmetric weights:
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                src_sum[tx * F_PER_WI + fw] += (int)in_trans0[tx * F_PER_WI + fw];
+            }
+        }
+#endif
+    }
+#endif
+
+    DEQUANTIZED_TYPE dequantized[TILE_X * F_PER_WI];
+    for (uint tx = 0; tx < TILE_X * F_PER_WI; ++tx) {
+        dequantized[tx] = TO_DEQUANTIZED_TYPE(acc[tx]);
+    }
+
+#if BIAS_TERM
+#   if BIAS_PER_OFM
+    __attribute__((opencl_unroll_hint))
+    for (uint fw = 0; fw < F_PER_WI; ++fw) {
+        uint bias_offset = f + fw * SIMD + get_sub_group_local_id();
+        BIAS_TYPE bias = biases[bias_offset];
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
+        }
+    }
+#   elif BIAS_PER_OUTPUT
+    __attribute__((opencl_unroll_hint))
+    for (uint tx = 0; tx < TILE_X; ++tx) {
+        __attribute__((opencl_unroll_hint))
+        for (uint fw = 0; fw < F_PER_WI; ++fw) {
+            uint bias_offset = GET_BIAS_INDEX(b, f + fw * SIMD + get_sub_group_local_id(), y, x + tx);
+            BIAS_TYPE bias = biases[bias_offset];
+            dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(bias);
+        }
+    }
+#   else
+#       error convolution_gpu_b_fs_yx_fsv_16_32_imad_dw.cl - unsupported bias mode.
+#   endif
+#endif
+
+#if ASYMMETRIC_WEIGHTS_QUANTIZATION
+    {
+        __attribute__((opencl_unroll_hint))
+        for (uint fw = 0; fw < F_PER_WI; ++fw) {
+            WEIGHTS_ZERO_POINTS_TYPE wzp = weights_zp[f + fw * SIMD + get_sub_group_local_id()];
+            __attribute__((opencl_unroll_hint))
+            for (uint tx = 0; tx < TILE_X; ++tx) {
+                dequantized[tx * F_PER_WI + fw] -= TO_DEQUANTIZED_TYPE(src_sum[tx * F_PER_WI + fw]) * TO_DEQUANTIZED_TYPE(wzp);
+            }
+        }
+    }
+#endif
+
+#if COMPENSATION_TERM
+    {
+        __attribute__((opencl_unroll_hint))
+        for (uint fw = 0; fw < F_PER_WI; ++fw) {
+            COMPENSATION_TYPE comp = compensation[f + fw * SIMD + get_sub_group_local_id()];
+            __attribute__((opencl_unroll_hint))
+            for (uint tx = 0; tx < TILE_X; ++tx) {
+                dequantized[tx * F_PER_WI + fw] += TO_DEQUANTIZED_TYPE(comp);
+            }
+        }
+    }
+#endif
+
+    OUTPUT_TYPE out[TILE_X * F_PER_WI];
+    // Fused ops and conversion to output type
+    __attribute__((opencl_unroll_hint))
+    for (uint tx = 0; tx < TILE_X; ++tx) {
+#if HAS_FUSED_OPS
+        uint fused_ops_x = x + tx;
+        uint fused_ops_f = f;
+        uint fw = 0;
+        __attribute__((opencl_unroll_hint))
+        for (; fw + 4 <= F_PER_WI; fw += 4) {
+            DEQUANTIZED_TYPE4 fused_ops_in;
+            ARRAY_TO_VEC_4(fused_ops_in, dequantized, tx * F_PER_WI + fw);
+            FUSED_OPS_4;
+            VEC_TO_ARRAY_4(out, FUSED_OPS_RESULT_4, tx * F_PER_WI + fw);
+            fused_ops_f += 4 * SIMD;
+        }
+        if (F_PER_WI % 4 >= 2) {
+            DEQUANTIZED_TYPE2 fused_ops_in;
+            ARRAY_TO_VEC_2(fused_ops_in, dequantized, tx * F_PER_WI + fw);
+            FUSED_OPS_2;
+            VEC_TO_ARRAY_2(out, FUSED_OPS_RESULT_2, tx * F_PER_WI + fw);
+            fw += 2;
+            fused_ops_f += 2 * SIMD;
+        }
+        if (F_PER_WI % 2 == 1) {
+            DEQUANTIZED_TYPE fused_ops_in;
+            fused_ops_in = dequantized[tx * F_PER_WI + fw];
+            FUSED_OPS_1;
+            out[tx * F_PER_WI + fw] = FUSED_OPS_RESULT_1;
+        }
+#else
+        __attribute__((opencl_unroll_hint))
+        for (uint fw = 0; fw < F_PER_WI; ++fw) {
+            out[tx * F_PER_WI + fw] = TO_OUTPUT_TYPE(dequantized[tx * F_PER_WI + fw]);
+        }
+#endif
+    }
+
+    // Fill results outside output in features with OUTPUT_PAD_VALUE.
+    if (OUTPUT_FEATURE_NUM % FSV != 0 && f + FSV > OUTPUT_FEATURE_NUM) {
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            __attribute__((opencl_unroll_hint))
+            for (uint fw = 0; fw < F_PER_WI; ++fw) {
+                bool outside = fw * SIMD + get_sub_group_local_id() >= OUTPUT_FEATURE_NUM % FSV;
+                out[tx * F_PER_WI + fw] = outside ? OUTPUT_PAD_VALUE : out[tx * F_PER_WI + fw];
+            }
+        }
+    }
+
+    uint output_offset = GET_OUTPUT_INDEX(b, f, y, x);
+
+    if (OUTPUT_SIZE_X % TILE_X == 0 || x + TILE_X <= OUTPUT_SIZE_X) {
+        // Full output tile x write using block write ladder
+        uint tx = 0;
+        __attribute__((opencl_unroll_hint))
+        for (; tx + 16 <= TILE_X * F_PER_WI; tx += 16) {
+            OUTPUT_TYPE16 tmp_write;
+            ARRAY_TO_VEC_16(tmp_write, out, tx);
+            OUTPUT_BLOCK_WRITE16(output + output_offset, tmp_write);
+            output_offset += 16 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 16 >= 8) {
+            OUTPUT_TYPE8 tmp_write;
+            ARRAY_TO_VEC_8(tmp_write, out, tx);
+            OUTPUT_BLOCK_WRITE8(output + output_offset, tmp_write);
+            tx += 8;
+            output_offset += 8 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 8 >= 4) {
+            OUTPUT_TYPE4 tmp_write;
+            ARRAY_TO_VEC_4(tmp_write, out, tx);
+            OUTPUT_BLOCK_WRITE4(output + output_offset, tmp_write);
+            tx += 4;
+            output_offset += 4 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 4 >= 2) {
+            OUTPUT_TYPE2 tmp_write;
+            ARRAY_TO_VEC_2(tmp_write, out, tx);
+            OUTPUT_BLOCK_WRITE2(output + output_offset, tmp_write);
+            tx += 2;
+            output_offset += 2 * SIMD;
+        }
+        if (TILE_X * F_PER_WI % 2 == 1) {
+            OUTPUT_BLOCK_WRITE(output + output_offset, out[tx]);
+        }
+    } else {
+        // Leftovers write, block writes in f dimension only
+        __attribute__((opencl_unroll_hint))
+        for (uint tx = 0; tx < TILE_X; ++tx) {
+            if (tx < OUTPUT_SIZE_X % TILE_X) {
+                uint fw = 0;
+                __attribute__((opencl_unroll_hint))
+                for (; fw + 4 <= F_PER_WI; fw += 4) {
+                    OUTPUT_TYPE4 tmp_write;
+                    ARRAY_TO_VEC_4(tmp_write, out, tx * F_PER_WI + fw);
+                    OUTPUT_BLOCK_WRITE4(output + output_offset + fw * SIMD, tmp_write);
+                }
+                if (F_PER_WI % 4 >= 2) {
+                    OUTPUT_TYPE2 tmp_write;
+                    ARRAY_TO_VEC_2(tmp_write, out, tx * F_PER_WI + fw);
+                    OUTPUT_BLOCK_WRITE2(output + output_offset + fw * SIMD, tmp_write);
+                    fw += 2;
+                }
+                if (F_PER_WI % 2 == 1) {
+                    OUTPUT_BLOCK_WRITE(output + output_offset + fw * SIMD, out[tx * F_PER_WI + fw]);
+                }
+            }
+            output_offset += FSV;
+        }
+    }
+}
+
+#undef FSV
+
+#undef F_PER_WI
+
+#undef DEQUANTIZED_TYPE
+#undef DEQUANTIZED_TYPE2
+#undef DEQUANTIZED_TYPE4
+
+#undef INPUT_TYPE
+#undef INPUT_TYPE2
+#undef INPUT_TYPE4
+#undef INPUT_TYPE8
+#undef INPUT_TYPE16
+
+#undef FILTER_TYPE4
+
+#undef OUTPUT_TYPE2
+#undef OUTPUT_TYPE4
+#undef OUTPUT_TYPE8
+#undef OUTPUT_TYPE16
+
+#undef AS_INPUT_TYPE
+#undef AS_INPUT_TYPE2
+#undef AS_INPUT_TYPE4
+#undef AS_INPUT_TYPE8
+#undef AS_INPUT_TYPE16
+
+#undef AS_FILTER_TYPE
+
+#undef TO_DEQUANTIZED_TYPE
+
+#undef GET_INPUT_INDEX
+#undef GET_WEIGHTS_INDEX
+#undef GET_OUTPUT_INDEX
+
+#undef INPUT_X_PITCH
+#undef INPUT_Y_PITCH
+
+#undef WEIGHTS_YXS_PITCH
+
+#undef FILTER_SPATIAL_SIZE
+
+#undef OUTPUT_BLOCK_WRITE
+#undef OUTPUT_BLOCK_WRITE2
+#undef OUTPUT_BLOCK_WRITE4
+#undef OUTPUT_BLOCK_WRITE8
+#undef OUTPUT_BLOCK_WRITE16
+
+#undef VEC_TO_ARRAY_2
+#undef VEC_TO_ARRAY_4
+#undef VEC_TO_ARRAY_8
+#undef VEC_TO_ARRAY_16
+
+#undef ARRAY_TO_VEC_2
+#undef ARRAY_TO_VEC_4
+#undef ARRAY_TO_VEC_8
+#undef ARRAY_TO_VEC_16
+
+#ifdef OUTPUT_PAD_VALUE_undef
+#   undef OUTPUT_PAD_VALUE
+#   undef OUTPUT_PAD_VALUE_undef
+#endif
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_iyxo.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_iyxo.cl
new file mode 100644 (file)
index 0000000..a29ee09
--- /dev/null
@@ -0,0 +1,119 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+KERNEL(convolution_gpu_bfyx_iyxo_5x5)(
+    const __global UNIT_TYPE* input,
+    __global UNIT_TYPE* output,
+    const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+    const __global UNIT_TYPE* bias,
+#endif
+    uint split_idx)
+{
+    const uint idx = 4 * ((uint)get_global_id(0) * 16 + (uint)get_global_id(2));
+    const uint idy = (uint)get_global_id(1);
+    uint filter_idx = 0;
+    uint output_idx = 0;
+    uint input_idx = 0;
+    UNIT_TYPE inp[8] = { 0 };
+
+#if FILTER_OFM_NUM > 16
+#define FILTER_OFM_MAX 16
+#else
+#define FILTER_OFM_MAX FILTER_OFM_NUM
+#endif
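+// FILTER_OFM_MAX limits how many output feature maps are accumulated per outer iteration
+// (at most 16), keeping the out1..out4 accumulator arrays small.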
+    __attribute__((opencl_unroll_hint(1)))
+        for (int iter = 0; iter < FILTER_OFM_NUM / FILTER_OFM_MAX + (FILTER_OFM_NUM % FILTER_OFM_MAX != 0); iter++) {
+            UNIT_TYPE out1[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out2[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out3[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out4[FILTER_OFM_MAX] = { 0 };
+
+            filter_idx = FILTER_OFM_MAX * iter;
+
+            __attribute__((opencl_unroll_hint(FILTER_IFM_NUM)))
+                for (int ifm = 0; ifm < FILTER_IFM_NUM; ifm++) {
+                    __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+                        for (int yy = 0; yy < FILTER_SIZE_Y; yy++) {
+                            uint inp_idx = ifm * INPUT0_FEATURE_PITCH + (idy + yy) * INPUT0_Y_PITCH + idx;
+                            half8 tmp = as_half8(vload4(0, (__global uint*)(input + inp_idx)));
+
+                            inp[0] = tmp.s0;
+                            inp[1] = tmp.s1;
+                            inp[2] = tmp.s2;
+                            inp[3] = tmp.s3;
+                            inp[4] = tmp.s4;
+                            inp[5] = tmp.s5;
+                            inp[6] = tmp.s6;
+                            inp[7] = tmp.s7;
+
+                            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+                                for (int xx = 0; xx < FILTER_SIZE_X; xx++) {
+#if FILTER_OFM_NUM == 4
+                                    half4 w = as_half4(vload2(0, (__global uint*)(weights + filter_idx)));
+#elif FILTER_OFM_NUM == 8
+                                    half8 w = as_half8(vload4(0, (__global uint*)(weights + filter_idx)));
+#else
+                                    half16 w = as_half16(vload8(0, (__global uint*)(weights + filter_idx)));
+#endif
+                                    __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+                                        for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+                                            out1[ofm] = mad(inp[0 + xx], w[ofm], out1[ofm]);
+                                            out2[ofm] = mad(inp[1 + xx], w[ofm], out2[ofm]);
+                                            out3[ofm] = mad(inp[2 + xx], w[ofm], out3[ofm]);
+                                            out4[ofm] = mad(inp[3 + xx], w[ofm], out4[ofm]);
+                                        }
+                                    filter_idx += FILTER_OFM_NUM;
+                                }
+                        }
+                }
+
+            __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+                for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+#if BIAS_TERM
+                    out1[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out2[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out3[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out4[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+#endif
+                    out1[ofm] = ACTIVATION(out1[ofm], ACTIVATION_PARAMS);
+                    out2[ofm] = ACTIVATION(out2[ofm], ACTIVATION_PARAMS);
+                    out3[ofm] = ACTIVATION(out3[ofm], ACTIVATION_PARAMS);
+                    out4[ofm] = ACTIVATION(out4[ofm], ACTIVATION_PARAMS);
+                    output_idx = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + ofm * OUTPUT_FEATURE_PITCH +
+                        idy * OUTPUT_Y_PITCH + idx;
+#if OUTPUT_OFFSET > 0
+#if (OUTPUT_OFFSET % 2) > 0
+                    output[output_idx + OUTPUT_OFFSET + 0] = out1[ofm];
+                    output[output_idx + OUTPUT_OFFSET + 1] = out2[ofm];
+                    output[output_idx + OUTPUT_OFFSET + 2] = out3[ofm];
+                    output[output_idx + OUTPUT_OFFSET + 3] = out4[ofm];
+#else
+                    __global float* out_fl = output + output_idx + OUTPUT_OFFSET;
+                    out_fl[0] = as_float((half2)(out1[ofm], out2[ofm]));
+                    out_fl[1] = as_float((half2)(out3[ofm], out4[ofm]));
+#endif
+#else
+                    vstore2((float2)(as_float((half2)(out1[ofm], out2[ofm])), as_float((half2)(out3[ofm], out4[ofm]))),
+                        0, (__global float*)(output + output_idx));
+#endif
+                }
+        }
+}
index b48d8cd..db9b893 100644 (file)
@@ -99,8 +99,7 @@ KERNEL(convolution_gpu_bfyx_to_fs_byx_fsv32)(
         out[out_i] = UNIT_VAL_ZERO;
     }
 
-    uint input_offset = INPUT0_OFFSET_WITH_PADDING;
-    input_offset += oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
+    uint input_offset = oc * STRIDE_SIZE_X + INPUT0_PADDING_OFFSET_SIZE_X;
     input_offset += (or * STRIDE_SIZE_Y + INPUT0_PADDING_OFFSET_SIZE_Y) * INPUT0_SIZE_X_WITH_PADDING;
     input_offset += b * INPUT0_BATCH_PITCH;
 
index f912ad1..b679854 100644 (file)
@@ -73,7 +73,7 @@ KERNEL(convolution_mmad_b_fs_yx_fsv32_dw)(
                     in = input[input_idx];
 #if ASYMMETRIC_DATA_QUANTIZATION
                 else
-                    in = activations_zp[k];
+                    in = activations_zp[g*FILTER_IFM_NUM + k];
 #endif
 
                 uint filter_idx = filter_offset + k*FILTER_IFM_PITCH + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_block2_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/depth_to_space_block2_opt.cl
new file mode 100644 (file)
index 0000000..9ce9302
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (c) 2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+
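+// Depth-to-space with BLOCK_SIZE == 2: each output feature map gathers values from four
+// input feature maps and scatters them into a 2x2 spatial block, storing pairs of halfs
+// as packed floats (see the vstore2 calls below).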
+KERNEL(depth_to_space_block2_opt)(const __global half* input, __global half* output)
+{
+    const int in_height  = get_global_size(1);
+    const int2 pos = { get_global_id(0), get_global_id(1) };
+
+    if (pos.x >= (IN_WIDTH) || pos.y >= in_height) return;
+
+    const int offset = IN_WIDTH * in_height;
+
+    __attribute__((opencl_unroll_hint(OUTPUT_FEATURE_NUM)))
+    for (uint ofm_id=0; ofm_id < OUTPUT_FEATURE_NUM; ofm_id++){
+        int add_off = offset * 2 * ofm_id * BLOCK_SIZE * BLOCK_SIZE;
+        int ofm_x_offset = offset * ofm_id;
+        const int inIdx = IN_WIDTH * pos.y + pos.x + ofm_x_offset;
+
+        half2 conv_out_0 = ACTIVATION(vload2(inIdx + (offset * 0 * OUTPUT_FEATURE_NUM), input), ACTIVATION_PARAMS);
+        half2 conv_out_1 = ACTIVATION(vload2(inIdx + (offset * 1 * OUTPUT_FEATURE_NUM), input), ACTIVATION_PARAMS);
+        half2 conv_out_2 = ACTIVATION(vload2(inIdx + (offset * 2 * OUTPUT_FEATURE_NUM), input), ACTIVATION_PARAMS);
+        half2 conv_out_3 = ACTIVATION(vload2(inIdx + (offset * 3 * OUTPUT_FEATURE_NUM), input), ACTIVATION_PARAMS);
+
+        int outIdx1 = IN_WIDTH * BLOCK_SIZE * pos.y + pos.x;
+        int outIdx2 = outIdx1 + IN_WIDTH;
+
+        vstore2((float2)(as_float((half2)(conv_out_0.s0, conv_out_1.s0)), as_float((half2)(conv_out_0.s1, conv_out_1.s1))), outIdx1, (__global float*) (output + add_off));
+        vstore2((float2)(as_float((half2)(conv_out_2.s0, conv_out_3.s0)), as_float((half2)(conv_out_2.s1, conv_out_3.s1))), outIdx2, (__global float*) (output + add_off));
+    }
+}
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_iyxo.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/fused_conv_eltwise_gpu_bfyx_iyxo.cl
new file mode 100644 (file)
index 0000000..f45cc3f
--- /dev/null
@@ -0,0 +1,199 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SUB_GROUP_SIZE)))
+KERNEL(fused_conv_eltwise_gpu_bfyx_iyxo)(
+    const __global UNIT_TYPE* input,
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+    write_only image2d_t output,
+#else
+    __global UNIT_TYPE* output,
+#endif
+    const __global UNIT_TYPE* weights,
+#if BIAS_TERM
+    const __global UNIT_TYPE* bias,
+#endif
+    uint split_idx,
+    const __global UNIT_TYPE* eltw_input)
+{
+    const uint idx = 4 * ((uint)get_global_id(0) * 16 + (uint)get_global_id(2));
+    const uint idy = (uint)get_global_id(1);
+    uint filter_idx = 0;
+    uint output_idx = 0;
+    uint output_idx_eltwise = 0;
+    uint input_idx = 0;
+    UNIT_TYPE inp[8] = { 0 };
+    const uint input0_pitch_Y = INPUT0_SIZE_X + 2 * (INPUT0_PAD_BEFORE_SIZE_X);
+    const uint input0_pitch_feature = input0_pitch_Y * (INPUT0_SIZE_Y + 2 * (INPUT0_PAD_BEFORE_SIZE_Y));
+
+#if FILTER_OFM_NUM > 16
+#define FILTER_OFM_MAX 16
+#else
+#define FILTER_OFM_MAX FILTER_OFM_NUM
+#endif
+    __attribute__((opencl_unroll_hint(1)))
+        for (int iter = 0; iter < FILTER_OFM_NUM / FILTER_OFM_MAX + (FILTER_OFM_NUM % FILTER_OFM_MAX != 0); iter++) {
+            UNIT_TYPE out1[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out2[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out3[FILTER_OFM_MAX] = { 0 };
+            UNIT_TYPE out4[FILTER_OFM_MAX] = { 0 };
+
+            filter_idx = FILTER_OFM_MAX * iter;
+
+            __attribute__((opencl_unroll_hint(FILTER_IFM_NUM)))
+                for (int ifm = 0; ifm < FILTER_IFM_NUM; ifm++) {
+                    __attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+                        for (int yy = 0; yy < FILTER_SIZE_Y; yy++) {
+                            uint inp_idx = ifm * input0_pitch_feature + (idy + yy) * input0_pitch_Y + idx;
+                            half8 tmp = as_half8(vload4(0, (__global uint*)(input + inp_idx)));
+
+                            inp[0] = tmp.s0;
+                            inp[1] = tmp.s1;
+                            inp[2] = tmp.s2;
+                            inp[3] = tmp.s3;
+                            inp[4] = tmp.s4;
+                            inp[5] = tmp.s5;
+                            inp[6] = tmp.s6;
+                            inp[7] = tmp.s7;
+
+                            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+                                for (int xx = 0; xx < FILTER_SIZE_X; xx++) {
+#if FILTER_OFM_NUM == 4
+                                    half4 w = as_half4(vload2(0, (__global uint*)(weights + filter_idx)));
+#elif FILTER_OFM_NUM == 8
+                                    half8 w = as_half8(vload4(0, (__global uint*)(weights + filter_idx)));
+#else
+                                    half16 w = as_half16(vload8(0, (__global uint*)(weights + filter_idx)));
+#endif
+                                    __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+                                        for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+                                            out1[ofm] = mad(inp[0 + xx], w[ofm], out1[ofm]);
+                                            out2[ofm] = mad(inp[1 + xx], w[ofm], out2[ofm]);
+                                            out3[ofm] = mad(inp[2 + xx], w[ofm], out3[ofm]);
+                                            out4[ofm] = mad(inp[3 + xx], w[ofm], out4[ofm]);
+                                        }
+                                    filter_idx += FILTER_OFM_NUM;
+                                }
+                        }
+                }
+
+            __attribute__((opencl_unroll_hint(FILTER_OFM_MAX)))
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+                for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm+=3) {
+#else
+                for (int ofm = 0; ofm < FILTER_OFM_MAX; ofm++) {
+#endif
+#if BIAS_TERM
+                    out1[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out2[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out3[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+                    out4[ofm] += bias[(iter * FILTER_OFM_MAX) + ofm];
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+                    out1[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+                    out2[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+                    out3[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+                    out4[ofm + 1] += bias[(iter * FILTER_OFM_MAX) + ofm + 1];
+
+                    out1[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+                    out2[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+                    out3[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+                    out4[ofm + 2] += bias[(iter * FILTER_OFM_MAX) + ofm + 2];
+#endif
+#endif
+                    out1[ofm] = ACTIVATION(out1[ofm], ACTIVATION_PARAMS);
+                    out2[ofm] = ACTIVATION(out2[ofm], ACTIVATION_PARAMS);
+                    out3[ofm] = ACTIVATION(out3[ofm], ACTIVATION_PARAMS);
+                    out4[ofm] = ACTIVATION(out4[ofm], ACTIVATION_PARAMS);
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+                    out1[ofm + 1] = ACTIVATION(out1[ofm + 1], ACTIVATION_PARAMS);
+                    out2[ofm + 1] = ACTIVATION(out2[ofm + 1], ACTIVATION_PARAMS);
+                    out3[ofm + 1] = ACTIVATION(out3[ofm + 1], ACTIVATION_PARAMS);
+                    out4[ofm + 1] = ACTIVATION(out4[ofm + 1], ACTIVATION_PARAMS);
+
+                    out1[ofm + 2] = ACTIVATION(out1[ofm + 2], ACTIVATION_PARAMS);
+                    out2[ofm + 2] = ACTIVATION(out2[ofm + 2], ACTIVATION_PARAMS);
+                    out3[ofm + 2] = ACTIVATION(out3[ofm + 2], ACTIVATION_PARAMS);
+                    out4[ofm + 2] = ACTIVATION(out4[ofm + 2], ACTIVATION_PARAMS);
+#endif
+                    uint ofm_alignment = 4;
+                    int idx_for_image = 0;
+                    int idy_for_image = 0;
+
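+                    // Each block of OUTPUT_FEATURE_NUM output channels maps to one corner of a
+                    // 2x2 spatial block: block 0 -> (2*idx, 2*idy), 1 -> (2*idx+1, 2*idy),
+                    // 2 -> (2*idx, 2*idy+1), 3 -> (2*idx+1, 2*idy+1).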
+                    if (ofm / OUTPUT_FEATURE_NUM == 0) {
+                        output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+                            2 * idy * OUTPUT_Y_PITCH + 2 * idx;
+                        output_idx = (ofm % OUTPUT_FEATURE_NUM) + 2 * idy * OUTPUT_SIZE_X * ofm_alignment + 2 * idx * ofm_alignment;
+                        idx_for_image = 2 * idx;
+                        idy_for_image = 2 * idy;
+                    }
+                    else if (ofm / OUTPUT_FEATURE_NUM == 1) {
+                        output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+                            2 * idy * OUTPUT_Y_PITCH + 2 * idx + 1;
+                        output_idx = (ofm % OUTPUT_FEATURE_NUM) + 2 * idy * OUTPUT_SIZE_X * ofm_alignment + (2 * idx + 1) * ofm_alignment;
+                        idx_for_image = 2 * idx + 1;
+                        idy_for_image = 2 * idy;
+                    }
+                    else if (ofm / OUTPUT_FEATURE_NUM == 2) {
+                        output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+                            (2 * idy + 1) * OUTPUT_Y_PITCH + 2 * idx;
+                        output_idx = (ofm % OUTPUT_FEATURE_NUM) + (2 * idy + 1) * OUTPUT_SIZE_X * ofm_alignment + 2 * idx * ofm_alignment;
+                        idx_for_image = 2 * idx;
+                        idy_for_image = 2 * idy + 1;
+                    }
+                    else if (ofm / OUTPUT_FEATURE_NUM == 3) {
+                        output_idx_eltwise = (iter * FILTER_OFM_MAX * OUTPUT_FEATURE_PITCH) + (ofm % OUTPUT_FEATURE_NUM) * OUTPUT_FEATURE_PITCH +
+                            (2 * idy + 1) * OUTPUT_Y_PITCH + 2 * idx + 1;
+                        output_idx = (ofm % OUTPUT_FEATURE_NUM) + (2 * idy + 1) * OUTPUT_SIZE_X * ofm_alignment + (2 * idx + 1) * ofm_alignment;
+                        idx_for_image = 2 * idx + 1;
+                        idy_for_image = 2 * idy + 1;
+                    }
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+                    half4 output_half1 = {
+                        out1[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 0],
+                        out1[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 1],
+                        out1[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0 + OUTPUT_FEATURE_PITCH * 2],
+                        0 };
+                    IMAGE_WRITE(output, (int2)(idx_for_image, idy_for_image), output_half1);
+                    half4 output_half2 = {
+                        out2[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 0],
+                        out2[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 1],
+                        out2[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2 + OUTPUT_FEATURE_PITCH * 2],
+                        0 };
+                    IMAGE_WRITE(output, (int2)(idx_for_image +2, idy_for_image), output_half2);
+                    half4 output_half3 = {
+                        out3[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 0],
+                        out3[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 1],
+                        out3[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4 + OUTPUT_FEATURE_PITCH * 2],
+                        0 };
+                    IMAGE_WRITE(output, (int2)(idx_for_image+4, idy_for_image), output_half3);
+                    half4 output_half4 = {
+                        out4[ofm + 0] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 0],
+                        out4[ofm + 1] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 1],
+                        out4[ofm + 2] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6 + OUTPUT_FEATURE_PITCH * 2],
+                        0 };
+                    IMAGE_WRITE(output, (int2)(idx_for_image+6, idy_for_image), output_half4);
+#else
+                    output[output_idx_eltwise + OUTPUT_OFFSET + 0] = out1[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 0];
+                    output[output_idx_eltwise + OUTPUT_OFFSET + 2] = out2[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 2];
+                    output[output_idx_eltwise + OUTPUT_OFFSET + 4] = out3[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 4];
+                    output[output_idx_eltwise + OUTPUT_OFFSET + 6] = out4[ofm] + eltw_input[output_idx_eltwise + OUTPUT_OFFSET + 6];
+#endif
+                }
+        }
+}
index 2911701..2e622e9 100644 (file)
 // Input reading operation is always blocked.
 #define BLOCK_LOAD_INPUTS
 
-// for now kernel stride is square
-#define K_WSTRIDE K_STRIDE
-#define K_HSTRIDE K_STRIDE
-
 // need (dilated) KERNEL width for the first output + STRIDE more for each additional output.
-#define IN_BLOCK_WIDTH  (K_WIDTH  + K_WSTRIDE * (OUT_BLOCK_WIDTH  - 1))
-#define IN_BLOCK_HEIGHT (K_HEIGHT + K_HSTRIDE * (OUT_BLOCK_HEIGHT - 1))
+#define IN_BLOCK_WIDTH  ((FILTER_SIZE_X - 1) * DILATION_SIZE_X + STRIDE_SIZE_X * (OUT_BLOCK_WIDTH  - 1) + 1)
+#define IN_BLOCK_HEIGHT ((FILTER_SIZE_Y - 1) * DILATION_SIZE_Y + STRIDE_SIZE_Y * (OUT_BLOCK_HEIGHT - 1) + 1)
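+// e.g. FILTER_SIZE_X = 3, DILATION_SIZE_X = 1, STRIDE_SIZE_X = 1, OUT_BLOCK_WIDTH = 4
+//      gives IN_BLOCK_WIDTH = (3 - 1) * 1 + 1 * (4 - 1) + 1 = 6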
 
 // for imad we are packing 4 8bit activations per 32 bit SIMD lane
 // if we later add 4bit, then PACK would be 8.
 #define AS_TYPE_N_(type, n, x) as_##type##n(x)
 #define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
 #define AS_INPUT0_TYPE_4(x) AS_TYPE_N(INPUT0_TYPE, 4, x)
+#define AS_FILTER_TYPE_4(x) AS_TYPE_N(FILTER_TYPE, 4, x)
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+#define ALIGN(a, b) (((a) % (b) == 0) ? (a) : ((a) - (a) % (b) + (b)))
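+// e.g. CEIL_DIV(10, 4) == 3 and ALIGN(10, 4) == 12 (rounds up to a multiple of b)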
 
 // int8 conv_input and weights data is packed to int32 "batches",
 // int/uint pointers here instead of INPUT0_TYPE/FILTER_TYPE for convenience
 __attribute__((intel_reqd_sub_group_size(SIMD_SIZE)))
+__attribute__((reqd_work_group_size(1, 1, SIMD_SIZE)))
 KERNEL (fused_convolution_eltwise_gpu_imad)(
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+    const __global INPUT0_TYPE* conv_input,
+#else
     const __global PACKED_TYPE   *conv_input,
-    __global OUTPUT_TYPE         *output,
+#endif
+    __global OUTPUT_TYPE         *restrict output,
     const __global int           *weights,
 #if BIAS_TERM
     const __global BIAS_TYPE     *biases,
@@ -69,48 +74,64 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 {
     const uint oc = (uint)get_global_id(0) * OUT_BLOCK_WIDTH;  // oc = Output Column
     const uint or = (uint)get_global_id(1) * OUT_BLOCK_HEIGHT; // or = Output Row
-    const uint fm = get_global_id(2);                    // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
+    const uint fm = get_global_id(2);                          // fm = Feature Map = od = Output Depth, SIMD is across this dimension, WG is 1x1x16
     const uint fmg = get_group_id(2);
     const uint lid = get_local_id(2);
-    const uint batch = fm / _OD;
-    const uint f = fm % _OD;
+    const uint batch = fm / (ALIGN(FILTER_OFM_NUM, SIMD_SIZE) * FILTER_GROUPS_NUM);
+#if GROUPED
+    const uint g = (fm / ALIGN(FILTER_OFM_NUM, SIMD_SIZE) % FILTER_GROUPS_NUM);
+    const uint ofmg = fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE);
+#else
+    const uint g = 0;
+    const uint ofmg = (fmg % (_OD  / SIMD_SIZE));
+#endif
+    const uint f = fm % ALIGN(FILTER_OFM_NUM, SIMD_SIZE) + g * FILTER_OFM_NUM;
+    const uint sglid = get_sub_group_local_id();
+
+    const int input_x = oc * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = or * STRIDE_SIZE_Y - PADDING_SIZE_Y;
 
     PACKED_TYPE in[IN_BLOCK_HEIGHT];
     ACCUMULATOR_TYPE out[OUT_BLOCK_WIDTH * OUT_BLOCK_HEIGHT] = { 0 };  // this is the 32 bit signed accumulator that must be converted to 8 bits before final write.
 
-    #define NUM_FILTERS (K_HEIGHT * K_WIDTH)
+    #define NUM_FILTERS (FILTER_SIZE_Y * FILTER_SIZE_X)
     int w[NUM_FILTERS];
-
     int in_addr;
 
 #ifdef BLOCK_LOAD_WEIGHTS
-    int weight_addr = (fmg % ((_OD + SIMD_SIZE - 1) / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK);
+    int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4);
 #else
-    int weight_addr = (fmg % ((_OD + SIMD_SIZE - 1) / SIMD_SIZE)) * ((_ID * K_HEIGHT * K_WIDTH * SIMD_SIZE) / PACK) + lid;
+    int weight_addr = (ofmg * CEIL_DIV(FILTER_IFM_NUM, PACK) * FILTER_SIZE_Y * FILTER_SIZE_X * SIMD_SIZE) + (g * FILTER_GROUPS_PITCH / 4) + sglid;
 #endif
-
-    uint input_size = (_ID * (_IH + IHPAD) * (_IW + IWPAD)) / PACK; // dividing by PACK to get right number of 32bit entities.
+    uint input_size = (_ID * (INPUT0_SIZE_Y + IHPAD) * (INPUT0_SIZE_X + IWPAD)) / PACK; // dividing by PACK to get the right number of 32-bit entities.
 
     // For imad we do 4X less input feature map iterations since we are packing 4 of them in each uchar4.
-    // _ID provided by host is multiple of packing factor.
     __attribute__((opencl_unroll_hint(1)))
-    for(int kd = 0; kd < (_ID / PACK); kd++)
+    for(int kd = 0; kd < CEIL_DIV(FILTER_IFM_NUM, PACK); kd++)
     {
-
-#ifdef BLOCK_LOAD_INPUTS
-        in_addr = INPUT0_OFFSET + kd*INPUT0_FEATURE_PITCH + (or * K_STRIDE - PADDING_SIZE_Y)*INPUT0_Y_PITCH + (oc * K_STRIDE - PADDING_SIZE_X);
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+        in_addr = INPUT0_GET_INDEX(batch, (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * PACK, input_y, input_x + sglid);
 #else
-        in_addr = INPUT0_OFFSET + kd*INPUT0_FEATURE_PITCH + (or * K_STRIDE - PADDING_SIZE_Y)*INPUT0_Y_PITCH + (oc * K_STRIDE - PADDING_SIZE_X) + lid;
-#endif
+    #ifdef BLOCK_LOAD_INPUTS
+        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x;
+    #else
+        in_addr = INPUT0_OFFSET + (kd + g * CEIL_DIV(FILTER_IFM_NUM, PACK)) * INPUT0_FEATURE_PITCH + input_y * INPUT0_Y_PITCH + input_x + sglid;
+    #endif
         in_addr += batch * input_size;  // adjust for batching
-
+#endif
         for(uint reg = 0; reg < IN_BLOCK_HEIGHT; reg++) {
-#ifdef BLOCK_LOAD_INPUTS
-            in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+#if INPUT0_LAYOUT_B_FS_YX_FSV16
+            in[reg] = *(__global PACKED_TYPE*)(conv_input + in_addr);
+            in_addr += (INPUT0_SIZE_X + IWPAD) * 16;
 #else
+    #ifdef BLOCK_LOAD_INPUTS
+            in[reg] = AS_PACKED_TYPE(intel_sub_group_block_read(&conv_input[in_addr]));
+    #else
             in[reg] = AS_PACKED_TYPE(conv_input[in_addr]);// read SIMD_SIZE elements wide
+    #endif
+            // TODO: This will cause errors for the byxf_af32 input format
+            in_addr += (INPUT0_SIZE_X + IWPAD);  // move to next row down
 #endif
-            in_addr += (_IW + IWPAD);  // move to next row down
         }
 
 #ifdef BLOCK_LOAD_WEIGHTS
@@ -126,17 +147,19 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 
         int wi = 0;
         // This loop is temporarily not unrolled because the unroll causes TeamCity hangs.
-        //__attribute__((opencl_unroll_hint(K_HEIGHT)))
-        for (int kr = 0; kr < K_HEIGHT; ++kr) // kr = Kernel Row
+        //__attribute__((opencl_unroll_hint(FILTER_SIZE_Y)))
+        for (int kr = 0; kr < FILTER_SIZE_Y; ++kr) // kr = Kernel Row
         {
-            __attribute__((opencl_unroll_hint(K_WIDTH)))
-            for (int kc = 0; kc < K_WIDTH; ++kc) // kc = Kernel Column
+            __attribute__((opencl_unroll_hint(FILTER_SIZE_X)))
+            for (int kc = 0; kc < FILTER_SIZE_X; ++kc) // kc = Kernel Column
             {
+                __attribute__((opencl_unroll_hint))
                 for (int br = 0; br < OUT_BLOCK_HEIGHT; br++) {
+                    __attribute__((opencl_unroll_hint))
                     for (int bc = 0; bc < OUT_BLOCK_WIDTH; bc++) {
-                        PACKED_TYPE input = sub_group_broadcast(in[br * K_HSTRIDE + kr], bc * K_WSTRIDE + kc);
+                        PACKED_TYPE input = sub_group_broadcast(in[br * STRIDE_SIZE_Y + kr * DILATION_SIZE_Y], bc * STRIDE_SIZE_X + kc * DILATION_SIZE_X);
 
-                        out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), as_char4(w[wi])));
+                        out[br * OUT_BLOCK_WIDTH + bc] = TO_ACCUMULATOR_TYPE(IMAD(out[br * OUT_BLOCK_WIDTH + bc], AS_INPUT0_TYPE_4(input), AS_FILTER_TYPE_4(w[wi])));
                     }
                 }
                 wi++;
@@ -148,7 +171,7 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
     // to calculate out_idx and eltw_idx. Calculate offsets with GET_DATA_B_FS_YX_FSV4_INDEX before
     // entering the loop, and have simple expressions for indexes inside the loop.
     const uint output_idx_offset = GET_DATA_B_FS_YX_FSV4_INDEX(OUTPUT, batch, f, or, oc);
-    const uint output_row_size_bytes = (_OW + OWPAD) * PACK;
+    const uint output_row_size_bytes = (OUTPUT_SIZE_X + OWPAD) * PACK;
 
 #if HAS_FUSED_OPS && FUSED_OPS_CAN_USE_PRELOAD
     FUSED_OPS_PRELOAD;
@@ -156,14 +179,14 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 
     for (int r = 0; r < OUT_BLOCK_HEIGHT; r++)
     {
-        #if NEED_TO_VERIFY_OUTPUT_RANGES == 1
+        #if OUTPUT_SIZE_Y % OUT_BLOCK_HEIGHT != 0
         const bool zero_r = or + r >= OUTPUT_SIZE_Y;
         if(!zero_r)
         #endif
         {
         for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
         {
-            #if NEED_TO_VERIFY_OUTPUT_RANGES == 1
+            #if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
             const bool zero_c = oc + c >= OUTPUT_SIZE_X;
             if(!zero_c)
             #endif
@@ -172,6 +195,8 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
                 uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
             #elif OUTPUT_LAYOUT_B_FS_YX_FSV4 == 1
                 uint out_idx = output_idx_offset + r * output_row_size_bytes + (c*PACK);
+            #elif OUTPUT_LAYOUT_B_FS_YX_FSV16 == 1
+                uint out_idx = OUTPUT_GET_INDEX(batch, f, or + r, oc + c);
             #else
                 #error "Incorrect output layout"
             #endif
@@ -188,16 +213,21 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
                 ACTIVATION_TYPE res = TO_ACTIVATION_TYPE(dotProd);
 #endif
 
+                OUTPUT_TYPE final_result;
 #if HAS_FUSED_OPS
     #if FUSED_OPS_CAN_USE_PRELOAD
                 FUSED_OPS_CALC;
     #else
                 FUSED_OPS;
     #endif
-                output[out_idx] = FUSED_OPS_RESULT;
+                final_result = FUSED_OPS_RESULT;
 #else
-                output[out_idx] = TO_OUTPUT_TYPE(res);
+                final_result = TO_OUTPUT_TYPE(res);
+#endif
+#if FILTER_OFM_NUM % SIMD_SIZE != 0
+                if (fmg % CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) != CEIL_DIV(FILTER_OFM_NUM, SIMD_SIZE) - 1 || sglid < FILTER_OFM_NUM % SIMD_SIZE)
 #endif
+                    output[out_idx] = final_result;
             }// if(!zero_c)
         } // for (int c = 0; c < OUT_BLOCK_WIDTH; c++)
         }// if(!zero_r)
@@ -209,12 +239,13 @@ KERNEL (fused_convolution_eltwise_gpu_imad)(
 #endif
 
 #undef BLOCK_LOAD_INPUTS
-#undef K_WSTRIDE
-#undef K_HSTRIDE
 #undef IN_BLOCK_WIDTH
 #undef IN_BLOCK_HEIGHT
 #undef PACK
 #undef AS_TYPE_N_
 #undef AS_TYPE_N
 #undef AS_INPUT0_TYPE_4
+#undef AS_FILTER_TYPE_4
 #undef NUM_FILTERS
+#undef CEIL_DIV
+#undef ALIGN
index 8d35591..eb8032f 100644 (file)
@@ -14,6 +14,8 @@
 // limitations under the License.
 */
 
+#include "mmad.cl"
+
 // TODO: currently we accumulate in float32 because there are a lot of "add" operations and the accumulator gets stuck at the value "8192.0f"
 #if !defined(ACCUMULATOR_TYPE)
     #define ACCUMULATOR_TYPE float
 #endif
 
 // Creates vector type.
-#define MAKE_VECTOR_TYPE(elem_type, size) CAT(elem_type, size)
\ No newline at end of file
+#define MAKE_VECTOR_TYPE_IMPL_1(elem_type)  elem_type
+#define MAKE_VECTOR_TYPE_IMPL_2(elem_type)  CAT(elem_type, 2)
+#define MAKE_VECTOR_TYPE_IMPL_3(elem_type)  CAT(elem_type, 3)
+#define MAKE_VECTOR_TYPE_IMPL_4(elem_type)  CAT(elem_type, 4)
+#define MAKE_VECTOR_TYPE_IMPL_8(elem_type)  CAT(elem_type, 8)
+#define MAKE_VECTOR_TYPE_IMPL_16(elem_type) CAT(elem_type, 16)
+#define MAKE_VECTOR_TYPE(elem_type, size)   CAT(MAKE_VECTOR_TYPE_IMPL_, size)(elem_type)
+
+#define AS_TYPE(type, val) CAT(as_, type)(val)
+
+// ====================================================================================================================
+// TYPE_SIZE(type) - evaluates to size of "type" in bytes
+// type [PP] - Must evaluate to non-vectorized type.
+// ====================================================================================================================
+#define TYPE_SIZE_uchar  1
+#define TYPE_SIZE_char   1
+#define TYPE_SIZE_ushort 2
+#define TYPE_SIZE_short  2
+#define TYPE_SIZE_half   2
+#define TYPE_SIZE_int    4
+#define TYPE_SIZE_uint   4
+#define TYPE_SIZE_float  4
+#define TYPE_SIZE(type) CAT(TYPE_SIZE_, type)
+
+// ====================================================================================================================
+// BLOCK_READN(type, vector_size, ptr, offset)
+//    - evaluates to intel_sub_group_block_read operation for specified "type" and "vector size", reading
+//      "vector_size" elements from memory starting at "ptr" + "offset"
+// BLOCK_WRITEN(type, vector_size, ptr, offset, val)
+//    - evaluates to intel_sub_group_block_write operation for specified "type" and "vector size", writing
+//      "vector_size"-element vector "val" to memory starting at "ptr" + "offset"
+//  For more details and description of intel_sub_group_block_read/write functions please,
+//  refer to cl_intel_subgroups extension documentation.
+//
+// type        [PP] - Must evaluate to non-vectorized type, ex. float, half, char, etc..
+// vector_size [PP] - Number of elements to read/write, ex 2 for intel_sub_group_block_read2.
+// ptr              - Pointer to global memory where to read from/write to.
+// offset           - Additional offset added to ptr in "type" elements, equivalent to passing ((ptr) + (offset)) as "ptr".
+// val              - For write function vector of "vector_size" of "type" elements (or scalar) to write.
+//
+// ====================================================================================================================
+// Pre-defined commonly used definitions:
+//   DT_<tensor>_BLOCK_READ<n>(ptr, offset)
+//   DT_<tensor>_BLOCK_WRITE<n>(ptr, offset, offset)
+// Where:
+//    <tensor> is one of: INPUT - referencing type jitted as INPUT0,
+//                        OUTPUT,
+//                        BIAS,
+//                        FILTER
+//    <n> is a vector size, one of {2,4,8,16} or none, meaning the output will be a scalar
+// 
+// ====================================================================================================================
+#define BLOCK_RW_TYPE_size1 uchar
+#define BLOCK_RW_TYPE_size2 ushort
+#define BLOCK_RW_TYPE_size4 uint
+#define BLOCK_RW_TYPE(type_size) CAT(BLOCK_RW_TYPE_size, type_size)
+
+#define BLOCK_READ_FUNC_size2       intel_sub_group_block_read_us
+#define BLOCK_READ_FUNC_size4       intel_sub_group_block_read
+#define BLOCK_READ_FUNC(type_size)  CAT(BLOCK_READ_FUNC_size, type_size)
+
+#define BLOCK_WRITE_FUNC_size2       intel_sub_group_block_write_us
+#define BLOCK_WRITE_FUNC_size4       intel_sub_group_block_write
+#define BLOCK_WRITE_FUNC(type_size)  CAT(BLOCK_WRITE_FUNC_size, type_size)
+
+#define BLOCK_READN_FUNC_size1(vector_size)                 CAT(BLOCK_READ_UC_, vector_size)
+#define BLOCK_READN_FUNC_SIZE_DEF(type_size, vector_size)   MAKE_VECTOR_TYPE(BLOCK_READ_FUNC(type_size), vector_size)
+#define BLOCK_READN_FUNC_size2(vector_size)                 BLOCK_READN_FUNC_SIZE_DEF(2, vector_size)
+#define BLOCK_READN_FUNC_size4(vector_size)                 BLOCK_READN_FUNC_SIZE_DEF(4, vector_size)
+#define BLOCK_READN_FUNC(type_size, vector_size)            CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)
+
+#define BLOCK_WRITEN_FUNC_size1(vector_size)                CAT(BLOCK_WRITE_UC_, vector_size)
+#define BLOCK_WRITEN_FUNC_SIZE_DEF(type_size, vector_size)  MAKE_VECTOR_TYPE(BLOCK_WRITE_FUNC(type_size), vector_size)
+#define BLOCK_WRITEN_FUNC_size2(vector_size)                BLOCK_WRITEN_FUNC_SIZE_DEF(2, vector_size)
+#define BLOCK_WRITEN_FUNC_size4(vector_size)                BLOCK_WRITEN_FUNC_SIZE_DEF(4, vector_size)
+#define BLOCK_WRITEN_FUNC(type_size, vector_size)           CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)
+
+#define BLOCK_READN_RAW(type_size, vector_size, ptr, offset)                                                    \
+    BLOCK_READN_FUNC(type_size, vector_size)((const __global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset))
+#define BLOCK_WRITEN_RAW(type_size, vector_size, ptr, offset, val)                                              \
+    BLOCK_WRITEN_FUNC(type_size, vector_size)(                                                                  \
+        (__global BLOCK_RW_TYPE(type_size)*)(ptr) + (offset),                                                   \
+        AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_RW_TYPE(type_size), vector_size), val))
+
+#define BLOCK_READN(type, vector_size, ptr, offset)                                                             \
+    AS_TYPE(MAKE_VECTOR_TYPE(type, vector_size), BLOCK_READN_RAW(TYPE_SIZE(type), vector_size, ptr, offset))
+#define BLOCK_WRITEN(type, vector_size, ptr, offset, val)                                                       \
+    BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, ptr, offset, val)
+
+#define DT_INPUT_BLOCK_READ(ptr, offset)            BLOCK_READN(INPUT0_TYPE, 1, ptr, offset)
+#define DT_INPUT_BLOCK_READ2(ptr, offset)           BLOCK_READN(INPUT0_TYPE, 2, ptr, offset)
+#define DT_INPUT_BLOCK_READ4(ptr, offset)           BLOCK_READN(INPUT0_TYPE, 4, ptr, offset)
+#define DT_INPUT_BLOCK_READ8(ptr, offset)           BLOCK_READN(INPUT0_TYPE, 8, ptr, offset)
+#define DT_INPUT_BLOCK_READ16(ptr, offset)          BLOCK_READN(INPUT0_TYPE, 16, ptr, offset)
+
+#define DT_INPUT_BLOCK_WRITE(ptr, offset, val)      BLOCK_WRITEN(INPUT0_TYPE, 1, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE2(ptr, offset, val)     BLOCK_WRITEN(INPUT0_TYPE, 2, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE4(ptr, offset, val)     BLOCK_WRITEN(INPUT0_TYPE, 4, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE8(ptr, offset, val)     BLOCK_WRITEN(INPUT0_TYPE, 8, ptr, offset, val)
+#define DT_INPUT_BLOCK_WRITE16(ptr, offset, val)    BLOCK_WRITEN(INPUT0_TYPE, 16, ptr, offset, val)
+
+#define DT_OUTPUT_BLOCK_READ(ptr, offset)           BLOCK_READN(OUTPUT_TYPE, 1, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ2(ptr, offset)          BLOCK_READN(OUTPUT_TYPE, 2, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ4(ptr, offset)          BLOCK_READN(OUTPUT_TYPE, 4, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ8(ptr, offset)          BLOCK_READN(OUTPUT_TYPE, 8, ptr, offset)
+#define DT_OUTPUT_BLOCK_READ16(ptr, offset)         BLOCK_READN(OUTPUT_TYPE, 16, ptr, offset)
+
+#define DT_OUTPUT_BLOCK_WRITE(ptr, offset, val)     BLOCK_WRITEN(OUTPUT_TYPE, 1, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE2(ptr, offset, val)    BLOCK_WRITEN(OUTPUT_TYPE, 2, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE4(ptr, offset, val)    BLOCK_WRITEN(OUTPUT_TYPE, 4, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE8(ptr, offset, val)    BLOCK_WRITEN(OUTPUT_TYPE, 8, ptr, offset, val)
+#define DT_OUTPUT_BLOCK_WRITE16(ptr, offset, val)   BLOCK_WRITEN(OUTPUT_TYPE, 16, ptr, offset, val)
+
+#define DT_BIAS_BLOCK_READ(ptr, offset)             BLOCK_READN(BIAS_TYPE, 1, ptr, offset)
+#define DT_BIAS_BLOCK_READ2(ptr, offset)            BLOCK_READN(BIAS_TYPE, 2, ptr, offset)
+#define DT_BIAS_BLOCK_READ4(ptr, offset)            BLOCK_READN(BIAS_TYPE, 4, ptr, offset)
+#define DT_BIAS_BLOCK_READ8(ptr, offset)            BLOCK_READN(BIAS_TYPE, 8, ptr, offset)
+#define DT_BIAS_BLOCK_READ16(ptr, offset)           BLOCK_READN(BIAS_TYPE, 16, ptr, offset)
+
+#define DT_BIAS_BLOCK_WRITE(ptr, offset, val)       BLOCK_WRITEN(BIAS_TYPE, 1, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE2(ptr, offset, val)      BLOCK_WRITEN(BIAS_TYPE, 2, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE4(ptr, offset, val)      BLOCK_WRITEN(BIAS_TYPE, 4, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE8(ptr, offset, val)      BLOCK_WRITEN(BIAS_TYPE, 8, ptr, offset, val)
+#define DT_BIAS_BLOCK_WRITE16(ptr, offset, val)     BLOCK_WRITEN(BIAS_TYPE, 16, ptr, offset, val)
+
+#define DT_FILTER_BLOCK_READ(ptr, offset)           BLOCK_READN(FILTER_TYPE, 1, ptr, offset)
+#define DT_FILTER_BLOCK_READ2(ptr, offset)          BLOCK_READN(FILTER_TYPE, 2, ptr, offset)
+#define DT_FILTER_BLOCK_READ4(ptr, offset)          BLOCK_READN(FILTER_TYPE, 4, ptr, offset)
+#define DT_FILTER_BLOCK_READ8(ptr, offset)          BLOCK_READN(FILTER_TYPE, 8, ptr, offset)
+#define DT_FILTER_BLOCK_READ16(ptr, offset)         BLOCK_READN(FILTER_TYPE, 16, ptr, offset)
+
+#define DT_FILTER_BLOCK_WRITE(ptr, offset, val)     BLOCK_WRITEN(FILTER_TYPE, 1, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE2(ptr, offset, val)    BLOCK_WRITEN(FILTER_TYPE, 2, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE4(ptr, offset, val)    BLOCK_WRITEN(FILTER_TYPE, 4, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE8(ptr, offset, val)    BLOCK_WRITEN(FILTER_TYPE, 8, ptr, offset, val)
+#define DT_FILTER_BLOCK_WRITE16(ptr, offset, val)   BLOCK_WRITEN(FILTER_TYPE, 16, ptr, offset, val)
+// ====================================================================================================================
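+
+// Illustrative usage (a sketch, not part of this header; assumes INPUT0_TYPE and
+// OUTPUT_TYPE are jitted as half, so TYPE_SIZE resolves to 2):
+//
+//     half4 v = DT_INPUT_BLOCK_READ4(src, offset);
+//     // expands to as_half4(intel_sub_group_block_read_us4((const __global ushort*)(src) + (offset)))
+//     DT_OUTPUT_BLOCK_WRITE4(dst, offset, v);
+//     // expands to intel_sub_group_block_write_us4((__global ushort*)(dst) + (offset), as_ushort4(v))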
index 41a8853..99ac419 100644 (file)
@@ -284,6 +284,32 @@ inline uint FUNC(get_b_fs_yx_fsv_index_safe)(uint b, uint f, uint y, uint x,
         CAT(prefix, _OFFSET)                                                              \
     )
 
+#define GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(prefix, o, i, y, x) \
+    FUNC_CALL(get_os_is_yx_osv16_isv16_index)(                    \
+        o, i, y, x,                                               \
+        CAT(prefix, _SIZE_X),                                     \
+        CAT(prefix, _SIZE_Y),                                     \
+        CAT(prefix, _IFM_NUM),                                    \
+        CAT(prefix, _OFM_NUM))
+
+inline uint FUNC(get_os_is_yx_osv16_isv16_index)(uint o, uint i, uint y, uint x,
+    uint x_size, uint y_size, uint i_size, uint o_size)
+{
+    const uint isv = i % 16;
+    const uint osv = o % 16;
+    const uint is = i / 16;
+    const uint os = o / 16;
+
+    const uint x_pitch = 16 * 16;
+    const uint y_pitch = x_pitch * x_size;
+    const uint is_pitch = y_pitch * y_size;
+    const uint os_pitch = is_pitch * ((i_size + 16 - 1) / 16);
+
+    const uint output_offset = isv + osv * 16 + x * x_pitch + y * y_pitch + is * is_pitch + os * os_pitch;
+
+    return output_offset;
+}
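+// Worked example (illustrative): for a 1x1 filter with IFM_NUM = 16, element
+// (o = 17, i = 3, y = 0, x = 0) has osv = 1, os = 1, isv = 3, is = 0, so
+// offset = 3 + 1 * 16 + 1 * 256 = 275.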
+
 #define GET_FILTER_G_OS_IS_YX_ISV8_OSV16_ISV2_INDEX(prefix, g, o, i, y, x, sub_group_size) \
     FUNC_CALL(get_os_is_zyx_isv8_osv16_isv2_index)(                                        \
         g, o, i, 0, y, x,                                                                  \
@@ -847,6 +873,45 @@ inline uint FUNC(get_b_fs_yx_fsv4)(uint o, uint i, uint y, uint x,
     return idx;
 }
 
+#define GET_FILTER_G_OS_IS_YX_OSV16_ISV4_INDEX(prefix, g, o, i, y, x) \
+    FUNC_CALL(get_g_os_is_yx_osv16_isv4)(                         \
+        g, o, i, y, x,                                            \
+        CAT(prefix, _IFM_PITCH),                                 \
+        CAT(prefix, _OFM_PITCH),                                 \
+        CAT(prefix, _SIZE_X),                                    \
+        CAT(prefix, _SIZE_Y),                                    \
+        CAT(prefix, _OFM_NUM),                                   \
+        CAT(prefix, _IFM_NUM))
+
+inline uint FUNC(get_g_os_is_yx_osv16_isv4)(uint g, uint o, uint i, uint y, uint x,
+                                          uint i_size,
+                                          uint o_size,
+                                          uint x_size,
+                                          uint y_size,
+                                          uint o_num,
+                                          uint i_num)
+{
+    const uint otd = 16;
+    uint out_depth_tile = o / otd;
+    uint od             = o - out_depth_tile * otd;
+    uint output_slice_size = (o_num + otd - 1) / otd;
+
+    const uint tile = 4;
+    uint id_tile = i / tile;
+    uint id      = i - id_tile * tile;
+    uint input_slice_size = (i_num + tile - 1) / tile;
+
+    uint idx = g * output_slice_size * input_slice_size * y_size * x_size * otd * tile
+                                       + out_depth_tile * (o_size / tile) * otd * tile
+                                       + id_tile                 * i_size * otd * tile
+                                       + y                       * x_size * otd * tile
+                                       + x                                * otd * tile
+                                       + od                                     * tile
+                                       + id;
+
+    return idx;
+}
+
 #define GET_FILTER_OS_IS_YX_OSV16_ISV4_INDEX(prefix, o, i, y, x) \
     FUNC_CALL(get_os_is_yx_osv16_isv4)(                          \
         o, i, y, x,                                              \
@@ -1239,9 +1304,8 @@ inline uint FUNC(get_os_is_osv32_isv32_swizzled_by_4_index)(uint o, uint i, uint
         CAT(prefix, _OFM_NUM),\
         CAT(prefix, _OFFSET))
 
-inline uint FUNC(get_os_i_yxs_osv4_yxsv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint size_x, uint size_y) {
+inline uint FUNC(get_os_i_yxs_osv_yxsv4_index)(uint o, uint i, uint y, uint x, uint i_size, uint size_x, uint size_y, uint osv) {
     const uint yxsv = 4;
-    const uint osv = 4;
     uint yx = y * size_x + x;
     uint yx_size_aligned = (size_x * size_y + yxsv - 1) / yxsv * yxsv;
     uint os_index = o / osv;
@@ -1259,11 +1323,12 @@ inline uint FUNC(get_os_i_yxs_osv4_yxsv4_index)(uint o, uint i, uint y, uint x,
 }
 
 #define GET_FILTER_OS_I_YXS_OSV4_YXSV4_INDEX(prefix, o, i, y, x)    \
-    FUNC_CALL(get_os_i_yxs_osv4_yxsv4_index)(                       \
+    FUNC_CALL(get_os_i_yxs_osv_yxsv4_index)(                        \
         o, i, y, x,                                                 \
         CAT(prefix, _IFM_NUM),                                      \
         CAT(prefix, _SIZE_X),                                       \
-        CAT(prefix, _SIZE_Y))
+        CAT(prefix, _SIZE_Y),                                       \
+        4)
 
 #define GET_FILTER_OS_IYX_OSV32__AI32_INDEX(prefix, o, i, y, x, sub_group_size) \
     CAT(prefix, _OFFSET) +                                                      \
@@ -1380,9 +1445,8 @@ inline uint FUNC(get_os_i_yxs_osv4_yxsv4_index)(uint o, uint i, uint y, uint x,
         CAT(prefix, _OFFSET),                                                       \
         sub_group_size)
 
-inline uint FUNC(get_gs_oi_yxs_gsv4_yxsv4_index)(uint g, uint o, uint i, uint y, uint x, uint o_size, uint i_size, uint size_x, uint size_y) {
+inline uint FUNC(get_gs_oi_yxs_gsv_yxsv4_index)(uint g, uint o, uint i, uint y, uint x, uint o_size, uint i_size, uint size_x, uint size_y, const uint gsv) {
     const uint yxsv = 4;
-    const uint gsv = 4;
     uint yx = y * size_x + x;
     uint yx_size_aligned = (size_x * size_y + yxsv - 1) / yxsv * yxsv;
     uint gs_index = g / gsv;
@@ -1400,12 +1464,31 @@ inline uint FUNC(get_gs_oi_yxs_gsv4_yxsv4_index)(uint g, uint o, uint i, uint y,
 }
 
 #define GET_FILTER_GS_OI_YXS_GSV4_YXSV4_INDEX(prefix, g, o, i, y, x) \
-    FUNC_CALL(get_gs_oi_yxs_gsv4_yxsv4_index)(                       \
+    FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)(                        \
         g, o, i, y, x,                                               \
         CAT(prefix, _OFM_NUM),                                       \
         CAT(prefix, _IFM_NUM),                                       \
         CAT(prefix, _SIZE_X),                                        \
-        CAT(prefix, _SIZE_Y))
+        CAT(prefix, _SIZE_Y),                                        \
+        4)
+
+#define GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(prefix, g, o, i, y, x) \
+    FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)(                         \
+        g, o, i, y, x,                                                \
+        CAT(prefix, _OFM_NUM),                                        \
+        CAT(prefix, _IFM_NUM),                                        \
+        CAT(prefix, _SIZE_X),                                         \
+        CAT(prefix, _SIZE_Y),                                         \
+        16)
+
+#define GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(prefix, g, o, i, y, x) \
+    FUNC_CALL(get_gs_oi_yxs_gsv_yxsv4_index)(                         \
+        g, o, i, y, x,                                                \
+        CAT(prefix, _OFM_NUM),                                        \
+        CAT(prefix, _IFM_NUM),                                        \
+        CAT(prefix, _SIZE_X),                                         \
+        CAT(prefix, _SIZE_Y),                                         \
+        32)
 
 #define GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(prefix, g, o, i, y, x, sub_group_size) \
     CAT(prefix, _OFFSET) +                                                             \
index 2b1f501..80fab34 100644 (file)
@@ -177,6 +177,64 @@ inline int8 FUNC(mmad8x8)(int8 A_vectors, int8 B_vectors, int8 acc) __attribute_
 }
 
 // TODO: remove it when cl_intel_subgroups_char extension will work
+inline void FUNC(sub_group_block_write_uchar16)(__global uchar* outPtr, uchar16 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc16(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s2; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s3; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s4; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s5; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s6; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s7; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s8; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s9; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sa; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sb; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sc; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sd; idx += get_max_sub_group_size();
+    outPtr[idx] = v.se; idx += get_max_sub_group_size();
+    outPtr[idx] = v.sf; idx += get_max_sub_group_size();
+#endif
+}
+
+inline uchar16 FUNC(sub_group_block_read_uchar16)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    // Workaround for compiler support:
+    // return intel_sub_group_block_read_uc16(ptr);
+    return (uchar16)(intel_sub_group_block_read_uc8(ptr), intel_sub_group_block_read_uc8(ptr + 8 * get_max_sub_group_size()));
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar16 ret;
+
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s4 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s5 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s6 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s8 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s9 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sa = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sb = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sc = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sd = ptr[idx]; idx += get_max_sub_group_size();
+    ret.se = ptr[idx]; idx += get_max_sub_group_size();
+    ret.sf = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
 inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
 {
 #ifdef cl_intel_subgroups_char
@@ -184,7 +242,7 @@ inline void FUNC(sub_group_block_write_uchar8)(__global uchar* outPtr, uchar8 v)
 #else
     uint idx = get_sub_group_local_id();
 
-       outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
     outPtr[idx] = v.s1; idx += get_max_sub_group_size();
     outPtr[idx] = v.s2; idx += get_max_sub_group_size();
     outPtr[idx] = v.s3; idx += get_max_sub_group_size();
@@ -214,7 +272,92 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr)
     ret.s7 = ptr[idx]; idx += get_max_sub_group_size();
 
     return ret;
+#endif
+}
+
+inline void FUNC(sub_group_block_write_uchar4)(__global uchar* outPtr, uchar4 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc4(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s2; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s3; idx += get_max_sub_group_size();
+#endif
+}
+
+inline uchar4 FUNC(sub_group_block_read_uchar4)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc4(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar4 ret;
+
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s2 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s3 = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
+inline void FUNC(sub_group_block_write_uchar2)(__global uchar* outPtr, uchar2 v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc2(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
 
+    outPtr[idx] = v.s0; idx += get_max_sub_group_size();
+    outPtr[idx] = v.s1; idx += get_max_sub_group_size();
+#endif
+}
+
+inline uchar2 FUNC(sub_group_block_read_uchar2)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc2(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar2 ret;
+
+    ret.s0 = ptr[idx]; idx += get_max_sub_group_size();
+    ret.s1 = ptr[idx]; idx += get_max_sub_group_size();
+
+    return ret;
+#endif
+}
+
+inline void FUNC(sub_group_block_write_uchar)(__global uchar* outPtr, uchar v)
+{
+#ifdef cl_intel_subgroups_char
+    intel_sub_group_block_write_uc(outPtr, v);
+#else
+    uint idx = get_sub_group_local_id();
+
+    outPtr[idx] = v;
+#endif
+}
+
+inline uchar FUNC(sub_group_block_read_uchar)(const __global uchar* ptr)
+{
+#ifdef cl_intel_subgroups_char
+    return intel_sub_group_block_read_uc(ptr);
+#else
+    uint idx = get_sub_group_local_id();
+
+    uchar ret;
+
+    ret = ptr[idx];
+
+    return ret;
 #endif
 }
 
@@ -227,3 +370,15 @@ inline uchar8 FUNC(sub_group_block_read_uchar8)(const __global uchar* ptr)
 #define SLM_BLOCK_WRITE_4(A, B) (FUNC_CALL(intel_sub_group_block_write_4)(A, B))
 #define SLM_BLOCK_READ_4(A) (FUNC_CALL(intel_sub_group_block_read_uint4)(A))
 #define SLM_BLOCK_READ_8(A) (FUNC_CALL(intel_sub_group_block_read_uint8)(A))
+
+#define BLOCK_READ_UC_1(ptr)  FUNC_CALL(sub_group_block_read_uchar)(ptr)
+#define BLOCK_READ_UC_2(ptr)  FUNC_CALL(sub_group_block_read_uchar2)(ptr)
+#define BLOCK_READ_UC_4(ptr)  FUNC_CALL(sub_group_block_read_uchar4)(ptr)
+#define BLOCK_READ_UC_8(ptr)  FUNC_CALL(sub_group_block_read_uchar8)(ptr)
+#define BLOCK_READ_UC_16(ptr) FUNC_CALL(sub_group_block_read_uchar16)(ptr)
+
+#define BLOCK_WRITE_UC_1(ptr, val)  FUNC_CALL(sub_group_block_write_uchar)(ptr, val)
+#define BLOCK_WRITE_UC_2(ptr, val)  FUNC_CALL(sub_group_block_write_uchar2)(ptr, val)
+#define BLOCK_WRITE_UC_4(ptr, val)  FUNC_CALL(sub_group_block_write_uchar4)(ptr, val)
+#define BLOCK_WRITE_UC_8(ptr, val)  FUNC_CALL(sub_group_block_write_uchar8)(ptr, val)
+#define BLOCK_WRITE_UC_16(ptr, val) FUNC_CALL(sub_group_block_write_uchar16)(ptr, val)
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad.cl
new file mode 100644 (file)
index 0000000..59e1f5f
--- /dev/null
@@ -0,0 +1,537 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/fetch.cl"
+#include "include/imad.cl"
+#include "include/data_types.cl"
+#include "include/common.cl"
+#include "include/mmad.cl"
+
+#include "mvn_gpu_b_fs_yx_fsv16_imad_accumulate.cl"
+#include "mvn_gpu_b_fs_yx_fsv16_imad_reduce.cl"
+
+// MVN - performs mean-variance normalization, i.e. normalizes the input data to have
+//       0 mean and, if NORMALIZE_VARIANCE is set, variance 1.
+//
+// Below is a set of 5 kernels:
+//   mvn_mean_1, mvn_mean_2, mvn_var_1, mvn_var_2, mvn_final
+// that can perform mvn operation in two modes.
+//
+// Basic mode:
+//   In this mode only the mvn_final kernel is used. It performs the required reductions for mean
+//   and variance in this single kernel, using a single work-group per slice of data-sets
+//   and reducing intermediate values through local memory.
+//   It does not require any additional jit constants.
+//   lws:          LWS x 1 x 1
+//   gws:          LWS x feature x batch
+//
+// Parallel mode:
+//   In this mode all kernels are used to provide extra parallelism, using global memory
+//   and host-side synchronization with events/an in-order queue.
+//   To calculate mean:
+//   The mvn_mean_1 kernel should be enqueued first, with extra global memory provided on the second input
+//     to store intermediate results from all work-groups.
+//     To activate this kernel MVN_KERNEL_MEAN_1 must be defined and evaluate to true/1.
+//     lws:           LWS x 1 x 1
+//     gws:           LWS * ITEM_GROUPS x feature x batch
+//     This kernel calculates a partial result for each of the ITEM_GROUPS work-groups and stores it into global memory.
+//
+//   The mvn_mean_2 kernel must be enqueued next to further reduce the previous results using a single work-group.
+//     This kernel expects the result of mvn_mean_1 on its first input; on the second input global memory of size
+//     batch * align(feature, FSV) must be provided to store the final mean values.
+//     It must be ensured that the mvn_mean_1 kernel has finished and stored its partial results before this kernel runs.
+//     To activate this kernel MVN_KERNEL_MEAN_2 must be defined and evaluate to true/1.
+//     lws:          LWS x 1 x 1
+//     gws:          LWS x feature x batch
+//
+//  If variance normalization is required, the mvn_var_1 and mvn_var_2 kernels should be enqueued analogously,
+//  additionally providing the results from the mvn_mean_2 kernel.
+//
+//  Finally the mvn_final kernel should be enqueued with the buffers holding the outputs of the previous kernels (mvn_mean_2, mvn_var_2).
+//  To enable parallel mode the PRECALC_MEAN and optionally PRECALC_VARIANCE definitions should be used.
+//  At this stage there is no further need to synchronize, as this kernel only performs a simple normalization given the known mean and inverse of variance.
+//  Because of this the kernel can be enqueued with full parallelization, without limiting it to a single work-group.
+//     lws:          SIMD x 1 x 1
+//     gws:          (x * y) / SIMD * SIMD x feature x batch
+//
+// Required jit constants:
+// SIMD         - Sub-group/simd size.
+// LWS          - Local work-size along 0th dimension, must be a multiple of SIMD.
+// GWS          - Global work-size along 0th dimension.
+//                In basic mode this must be equal to LWS.
+//                In parallel mode this must be equal to LWS * ITEM_GROUPS, except in mvn_final kernel where it has no restrictions.
+// ITEM_GROUPS  - Number of work-groups performing accumulation in parallel mode. Should be the same in both stages of parallel kernels.
+
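+// Illustrative host-side enqueue order for parallel mode (a sketch only; buffer names such as
+// "sums", "means", "var_sums" and "inv_vars" are placeholders, not symbols defined in this file,
+// and "f_slices" stands for the number of FSV-sized feature slices):
+//
+//   enqueue(mvn_mean_1, gws = {LWS * ITEM_GROUPS, f_slices, batch}, args = {input, sums});
+//   enqueue(mvn_mean_2, gws = {LWS,               f_slices, batch}, args = {sums, means});
+//   // only when NORMALIZE_VARIANCE:
+//   enqueue(mvn_var_1,  gws = {LWS * ITEM_GROUPS, f_slices, batch}, args = {input, means, var_sums});
+//   enqueue(mvn_var_2,  gws = {LWS,               f_slices, batch}, args = {var_sums, inv_vars});
+//   // PRECALC_MEAN (and PRECALC_VARIANCE) enabled for the final kernel:
+//   enqueue(mvn_final,  gws = {(x * y) / SIMD * SIMD, f_slices, batch}, args = {input, output, means, inv_vars});
+//   // each enqueue must observe completion of the previous one (events or an in-order queue)
+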
+
+#define FSV                   16
+#define INPUT_SLICE_PITCH     16
+#define SG_NUM                (LWS / SIMD)
+
+#define INPUT_TYPE2           MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
+#define INPUT_TYPE4           MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
+#define INPUT_TYPE8           MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+#define INPUT_PACKED_TYPE     MAKE_VECTOR_TYPE(INPUT0_TYPE, FSV)
+#define OUTPUT_PACKED_TYPE    MAKE_VECTOR_TYPE(OUTPUT_TYPE, FSV)
+#define MEAN_PACKED_TYPE      MAKE_VECTOR_TYPE(MEAN_TYPE, FSV)
+#define INT_PACKED_TYPE       MAKE_VECTOR_TYPE(int, FSV)
+
+#define TO_MEAN_PACKED_TYPE   CAT(convert_, MEAN_PACKED_TYPE)
+
+#define ITEMS_NUM             (OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
+
+#define CEIL_DIV(a, b)        (((a) + (b) - 1) / (b))
+
+// ================================================================================================
+#if MVN_KERNEL_MEAN_1
+
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM)
+
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, int, FSV, SG_NUM, REDUCE_NO_POST_OP)
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_mean_1)(const __global INPUT0_TYPE* input,
+                   __global int* intermidiate_sum) {
+    uint b = get_global_id(2);
+    uint f = get_global_id(1) * FSV;
+    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+    uint items_group = get_group_id(0);
+    const uint sgid = get_sub_group_id();
+    const uint sglid = get_sub_group_local_id();
+
+    const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+
+    __local int slm_acc[(SG_NUM - 1) * FSV];
+
+    INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_global_id(0));
+    int full_sum = FUNC_CALL(reduce_sum_across_sg)(partial_sum, slm_acc);
+
+    if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+        intermidiate_sum[flat_data_set_group * ITEM_GROUPS * FSV + items_group * FSV + sglid] = full_sum;
+    }
+}
+// ================================================================================================
+#elif MVN_KERNEL_MEAN_2
+
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, int, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
+
+#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean_across_sg, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN)
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_mean_2)(const __global int* intermidiate_sum,
+                   __global MEAN_TYPE* intermidiate_mean) {
+    uint b = get_global_id(2);
+    uint f = get_global_id(1) * FSV;
+    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+    const uint sgid = get_sub_group_id();
+    const uint sglid = get_sub_group_local_id();
+
+    const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV;
+
+    INT_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum_input)(intermidiate_sum, data_sets_offset, get_local_id(0));
+    __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+    MEAN_TYPE mean = FUNC_CALL(reduce_mean_across_sg)(TO_MEAN_PACKED_TYPE(complete_sum), slm_acc);
+
+    if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+        intermidiate_mean[flat_data_set_group * FSV + sglid] = mean;
+    }
+}
+// ================================================================================================
+#elif MVN_KERNEL_VAR_1
+
+#define EXTRA_ARGS_DECL_IMPL    , MEAN_TYPE mean
+#define EXTRA_ARGS_IMPL         , mean
+#define EXTRA_ARGS_DECL         EXTRA_ARGS_DECL_IMPL
+#define EXTRA_ARGS              EXTRA_ARGS_IMPL
+#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean)   ACCUMULATE_SUM_SQ(curr, TO_MEAN_TYPE(next) - intel_sub_group_shuffle(mean, idx), idx)
+DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, GWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS)
+
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_across_sg, MEAN_TYPE, FSV, SG_NUM, REDUCE_NO_POST_OP)
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_var_1)(const __global INPUT0_TYPE* input,
+                  const __global MEAN_TYPE* means,
+                  __global MEAN_TYPE* intermidiate_sum) {
+    uint b = get_global_id(2);
+    uint f = get_global_id(1) * FSV;
+    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+    uint items_group = get_group_id(0);
+    const uint sgid = get_sub_group_id();
+    const uint sglid = get_sub_group_local_id();
+
+    const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+
+    __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+
+    MEAN_TYPE mean = means[flat_data_set_group * FSV + sglid];
+    MEAN_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_global_id(0), mean);
+    MEAN_TYPE full_sum = FUNC_CALL(reduce_sum_across_sg)(partial_sum, slm_acc);
+
+    if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+        intermidiate_sum[flat_data_set_group * ITEM_GROUPS * FSV + items_group * FSV + sglid] = full_sum;
+    }
+}
+// ================================================================================================
+#elif MVN_KERNEL_VAR_2
+
+DECLARE_PACKED_ACCUMULATE(accumulate_sum, MEAN_TYPE, MEAN_TYPE, FSV, INPUT_SLICE_PITCH, ITEM_GROUPS, LWS, ACCUMULATE_SUM)
+
+#define CALC_INVERSE_VARIANCE(sum_diff_sq)   native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, -0.5f)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_var_across_sg, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE)
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_var_2)(const __global MEAN_TYPE* intermidiate_sum,
+                   __global MEAN_TYPE* intermidiate_ivar) {
+    uint b = get_global_id(2);
+    uint f = get_global_id(1) * FSV;
+    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+
+    uint items_group = get_group_id(0);
+    const uint sgid = get_sub_group_id();
+    const uint sglid = get_sub_group_local_id();
+
+    const uint data_sets_offset = flat_data_set_group * ITEM_GROUPS * FSV;
+
+    MEAN_PACKED_TYPE complete_sum = FUNC_CALL(accumulate_sum)(intermidiate_sum, data_sets_offset, get_local_id(0));
+
+    __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+    MEAN_TYPE inv_variance = FUNC_CALL(reduce_var_across_sg)(complete_sum, slm_acc);
+
+    if (sgid == 0 && (sglid < FSV || SIMD == FSV)) {
+        intermidiate_ivar[flat_data_set_group * FSV + sglid] = inv_variance;
+    }
+}
+// ================================================================================================
+#else // MVN_KERNEL_MAIN
+
+// Mean:
+DECLARE_PACKED_ACCUMULATE(accumulate_sum_input, int, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM)
+
+#define CALC_MEAN(sum) ((sum) / ITEMS_NUM)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_mean, MEAN_TYPE, FSV, SG_NUM, CALC_MEAN)
+
+// Variance:
+#define EXTRA_ARGS_DECL_IMPL    , MEAN_TYPE mean
+#define EXTRA_ARGS_IMPL         , mean
+#define EXTRA_ARGS_DECL         EXTRA_ARGS_DECL_IMPL
+#define EXTRA_ARGS              EXTRA_ARGS_IMPL
+#define ACCUMULATE_SUM_SQ_DEV(curr, next, idx, mean)   ACCUMULATE_SUM_SQ(curr, next - intel_sub_group_shuffle(mean, idx), idx)
+DECLARE_PACKED_ACCUMULATE_EARGS(accumulate_sum_sq_dev, MEAN_TYPE, INPUT0_TYPE, FSV, INPUT_SLICE_PITCH, ITEMS_NUM, LWS, ACCUMULATE_SUM_SQ_DEV, EXTRA_ARGS_DECL, EXTRA_ARGS)
+
+#define CALC_INVERSE_VARIANCE(sum_diff_sq)   native_powr((sum_diff_sq) / ITEMS_NUM + (MEAN_TYPE)EPSILON, -0.5f)
+DECLARE_WG_PACKED_REDUCE_ADD(reduce_inverse_variance, MEAN_TYPE, FSV, SG_NUM, CALC_INVERSE_VARIANCE)
+
+#define INPUT_PACKED_BLOCK_READ(ptr)   CAT(as_, INPUT_PACKED_TYPE)(CAT(BLOCK_READ_UC_, FSV)((const __global uchar*)ptr))
+
+#define OUTPUT_PAD_IN_ITEMS (OUTPUT_PAD_BEFORE_SIZE_X != 0 || OUTPUT_PAD_AFTER_SIZE_X != 0 || OUTPUT_PAD_BEFORE_SIZE_Y != 0)
+
+__attribute__((intel_reqd_sub_group_size(SIMD)))
+__attribute__((reqd_work_group_size(LWS, 1, 1)))
+KERNEL(mvn_final)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* restrict output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+#if PRECALC_MEAN
+    , const __global MEAN_TYPE* means
+#endif
+#if PRECALC_VARIANCE
+    , const __global MEAN_TYPE* variances
+#endif
+) {
+    uint b = get_global_id(2);
+    uint f = get_global_id(1) * FSV;
+    uint flat_data_set_group = b * CEIL_DIV(OUTPUT_FEATURE_NUM, FSV) + get_global_id(1);
+#if GWS != LWS
+    uint items_group = get_group_id(0);
+#else
+    uint items_group = 0;
+#endif
+    const uint sgid = get_sub_group_id() + items_group * SG_NUM;
+    const uint sglid = get_sub_group_local_id();
+
+    const uint data_sets_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+    uint input_offset;
+
+#if !PRECALC_MEAN || (NORMALIZE_VARIANCE && !PRECALC_VARIANCE)
+    __local MEAN_TYPE slm_acc[(SG_NUM - 1) * FSV];
+#endif
+
+#if PRECALC_MEAN
+    MEAN_TYPE mean = means[flat_data_set_group * FSV + sglid];
+#else
+    INT_PACKED_TYPE partial_sum = FUNC_CALL(accumulate_sum_input)(input, data_sets_offset, get_local_id(0));
+    MEAN_TYPE mean = FUNC_CALL(reduce_mean)(TO_MEAN_PACKED_TYPE(partial_sum), slm_acc);
+#endif
+
+#if NORMALIZE_VARIANCE
+#   if PRECALC_VARIANCE
+    MEAN_TYPE inv_variance = variances[flat_data_set_group * FSV + sglid];
+#   else
+    MEAN_PACKED_TYPE partial_dev = FUNC_CALL(accumulate_sum_sq_dev)(input, data_sets_offset, get_local_id(0), mean);
+    MEAN_TYPE inv_variance = FUNC_CALL(reduce_inverse_variance)(partial_dev, slm_acc);
+#   endif
+#else
+    MEAN_TYPE inv_variance = 1;
+#endif
+
+#if OUTPUT_IS_FP
+    input_offset = data_sets_offset + sgid * SIMD * FSV;
+    uint output_spatial_base = sgid * SIMD;
+    uint output_offset = OUTPUT_GET_INDEX(b, f, 0, 0) + sgid * SIMD * FSV;
+    // For fused ops to align with non-fp path
+    const uint set_idx = sglid;
+
+    for (uint spatial_idx = 0; spatial_idx < ITEMS_NUM / GWS; ++spatial_idx) {
+        INPUT_PACKED_TYPE in_pack = INPUT_PACKED_BLOCK_READ(input + input_offset);
+
+        __attribute__((opencl_unroll_hint))
+        for (uint si = 0; si < SIMD; ++si) {
+            uint output_spatial = output_spatial_base + si;
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+                FUSED_OPS;
+                result = FUSED_OPS_RESULT;
+#else
+                result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+            uint x = output_spatial % OUTPUT_SIZE_X;
+            uint y = output_spatial / OUTPUT_SIZE_X;
+            output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+        }
+        input_offset += GWS * FSV;
+        output_offset += GWS * FSV;
+        output_spatial_base += GWS;
+    }
+
+    // [constexpr] Number of leftovers after full local work-group iterations.
+    const uint lws_uniform_leftovers = ITEMS_NUM % GWS;
+    // [constexpr] Number of sub-groups that can process leftovers loading SIMD items.
+    const uint lws_uniform_leftovers_full_simds = lws_uniform_leftovers / SIMD;
+    // [constexpr] Number of leftovers after full sub-group processing.
+    const uint sg_uniform_leftovers = lws_uniform_leftovers % SIMD;
+
+    if (lws_uniform_leftovers_full_simds > 0 && sgid < lws_uniform_leftovers_full_simds) {
+        // Process leftovers that can use full sub-group.
+        INPUT_PACKED_TYPE in_pack = INPUT_PACKED_BLOCK_READ(input + input_offset);
+
+        __attribute__((opencl_unroll_hint))
+        for (uint si = 0; si < SIMD; ++si) {
+            uint output_spatial = output_spatial_base + si;
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+                FUSED_OPS;
+                result = FUSED_OPS_RESULT;
+#else
+                result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+            uint x = output_spatial % OUTPUT_SIZE_X;
+            uint y = output_spatial / OUTPUT_SIZE_X;
+            output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+        }
+    } else if (lws_uniform_leftovers > 0 &&
+               sg_uniform_leftovers > 0 &&
+               sgid == lws_uniform_leftovers_full_simds) {
+        // TODO: It may be worth treating the data here as spread across the sub-group.
+        // Rest of leftovers: still use the whole sub-group, but change addresses so no extra data is loaded.
+        INPUT_PACKED_TYPE in_pack;
+        uint pack_idx = 0;
+        if (sg_uniform_leftovers >= 8) {
+            INPUT_TYPE8 tmp_in = DT_INPUT_BLOCK_READ8(input, input_offset + pack_idx * SIMD);
+            in_pack[pack_idx + 0] = tmp_in[0];
+            in_pack[pack_idx + 1] = tmp_in[1];
+            in_pack[pack_idx + 2] = tmp_in[2];
+            in_pack[pack_idx + 3] = tmp_in[3];
+            in_pack[pack_idx + 4] = tmp_in[4];
+            in_pack[pack_idx + 5] = tmp_in[5];
+            in_pack[pack_idx + 6] = tmp_in[6];
+            in_pack[pack_idx + 7] = tmp_in[7];
+            pack_idx += 8;
+        }
+        if (sg_uniform_leftovers % 8 >= 4) {
+            INPUT_TYPE4 tmp_in = DT_INPUT_BLOCK_READ4(input, input_offset + pack_idx * SIMD);
+            in_pack[pack_idx + 0] = tmp_in[0];
+            in_pack[pack_idx + 1] = tmp_in[1];
+            in_pack[pack_idx + 2] = tmp_in[2];
+            in_pack[pack_idx + 3] = tmp_in[3];
+            pack_idx += 4;
+        }
+        if (sg_uniform_leftovers % 4 >= 2) {
+            INPUT_TYPE2 tmp_in = DT_INPUT_BLOCK_READ2(input, input_offset + pack_idx * SIMD);
+            in_pack[pack_idx + 0] = tmp_in[0];
+            in_pack[pack_idx + 1] = tmp_in[1];
+            pack_idx += 2;
+        }
+        if (sg_uniform_leftovers % 2 == 1) {
+            in_pack[pack_idx] = DT_INPUT_BLOCK_READ(input, input_offset + pack_idx * SIMD);
+        }
+
+        OUTPUT_PACKED_TYPE result;
+        __attribute__((opencl_unroll_hint))
+        for (uint si = 0; si < sg_uniform_leftovers; ++si) {
+            uint output_spatial = output_spatial_base + si;
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[si]) - mean) * inv_variance;
+            OUTPUT_TYPE result;
+#if HAS_FUSED_OPS
+            FUSED_OPS;
+            result = FUSED_OPS_RESULT;
+#else
+            result = TO_OUTPUT_TYPE(normalized);
+#endif
+#if !OUTPUT_PAD_IN_ITEMS
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + si * SIMD, result);
+#else
+            uint x = output_spatial % OUTPUT_SIZE_X;
+            uint y = output_spatial / OUTPUT_SIZE_X;
+            output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset, result);
+#endif
+        }
+    }
+#else // => !OUTPUT_IS_FP
+    input_offset = data_sets_offset + sgid * SIMD * FSV;
+    uint output_offset = OUTPUT_GET_INDEX(b, f, 0, 0) + sgid * SIMD * FSV;
+    uint output_spatial = sgid * SIMD + sglid;
+
+    for (uint spatial_idx = 0; spatial_idx < ITEMS_NUM / GWS; ++spatial_idx) {
+        INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid];
+
+        OUTPUT_PACKED_TYPE result;
+        __attribute__((opencl_unroll_hint))
+        for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+            #if HAS_FUSED_OPS
+                FUSED_OPS;
+                result[set_idx] = FUSED_OPS_RESULT;
+            #else
+                result[set_idx] = TO_OUTPUT_TYPE(normalized);
+            #endif
+        }
+#if !OUTPUT_PAD_IN_ITEMS
+        ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+        uint x = output_spatial % OUTPUT_SIZE_X;
+        uint y = output_spatial / OUTPUT_SIZE_X;
+        output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+        ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+
+        input_offset += GWS * FSV;
+        output_offset += GWS * FSV;
+        output_spatial += GWS;
+    }
+
+    // [constexpr] Number of leftovers after full local work-group iterations.
+    const uint lws_uniform_leftovers = ITEMS_NUM % GWS;
+    // [constexpr] Number of sub-groups that can process leftovers loading SIMD items.
+    const uint lws_uniform_leftovers_full_simds = lws_uniform_leftovers / SIMD;
+    // [constexpr] Number of leftovers after full sub-group processing.
+    const uint sg_uniform_leftovers = lws_uniform_leftovers % SIMD;
+
+    if (lws_uniform_leftovers_full_simds > 0 && sgid < lws_uniform_leftovers_full_simds) {
+        // Process leftovers that can use full sub-group.
+        INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid];
+
+        OUTPUT_PACKED_TYPE result;
+        __attribute__((opencl_unroll_hint))
+        for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+            #if HAS_FUSED_OPS
+                FUSED_OPS;
+                result[set_idx] = FUSED_OPS_RESULT;
+            #else
+                result[set_idx] = TO_OUTPUT_TYPE(normalized);
+            #endif
+        }
+#if !OUTPUT_PAD_IN_ITEMS
+        ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+        uint x = output_spatial % OUTPUT_SIZE_X;
+        uint y = output_spatial / OUTPUT_SIZE_X;
+        output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+        ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+    } else if (lws_uniform_leftovers > 0 &&
+               sg_uniform_leftovers > 0 &&
+               sgid == lws_uniform_leftovers_full_simds) {
+        // TODO: It may be worth treating the data here as spread across the sub-group.
+        // Rest of leftovers: still use the whole sub-group, but change addresses so no extra data is loaded.
+        INPUT_PACKED_TYPE in_pack = ((const __global INPUT_PACKED_TYPE*)(input + input_offset))[sglid % sg_uniform_leftovers];
+
+        OUTPUT_PACKED_TYPE result;
+        __attribute__((opencl_unroll_hint))
+        for (uint set_idx = 0; set_idx < FSV; ++set_idx) {
+            MEAN_TYPE normalized = (TO_MEAN_TYPE(in_pack[set_idx]) - intel_sub_group_shuffle(mean, set_idx)) * intel_sub_group_shuffle(inv_variance, set_idx);
+            #if HAS_FUSED_OPS
+                FUSED_OPS;
+                result[set_idx] = FUSED_OPS_RESULT;
+            #else
+                result[set_idx] = TO_OUTPUT_TYPE(normalized);
+            #endif
+        }
+        if (sglid < sg_uniform_leftovers) {
+#if !OUTPUT_PAD_IN_ITEMS
+            ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[sglid] = result;
+#else
+            uint x = output_spatial % OUTPUT_SIZE_X;
+            uint y = output_spatial / OUTPUT_SIZE_X;
+            output_offset = OUTPUT_GET_INDEX(b, f, y, x);
+            ((__global OUTPUT_PACKED_TYPE*)(output + output_offset))[0] = result;
+#endif
+        }
+    }
+#endif
+}
+
+#endif
+// ================================================================================================
+
+#undef FSV
+#undef INPUT_SLICE_PITCH
+#undef SG_NUM
+
+#undef INPUT_TYPE2
+#undef INPUT_TYPE4
+#undef INPUT_TYPE8
+#undef INPUT_PACKED_TYPE
+#undef OUTPUT_PACKED_TYPE
+#undef INT_PACKED_TYPE
+#undef MEAN_PACKED_TYPE
+#undef TO_MEAN_PACKED_TYPE
+
+#undef INPUT_PACKED_BLOCK_READ
+#undef OUTPUT_PAD_IN_ITEMS
+
+#undef CEIL_DIV
+#undef USE_IMAD
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_accumulate.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_accumulate.cl
new file mode 100644 (file)
index 0000000..fbbc8a2
--- /dev/null
@@ -0,0 +1,107 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/data_types.cl"
+
+// ==============================================================================================================================
+// DECLARE_PACKED_ACCUMULATE(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp)
+// DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, ExtraArgsDecl, ExtraArgs)
+//
+// Declares function "Name" performing parallel packed accumulation:
+// AccT<SliceSize> Name (const __global InputT* input, uint offset, uint worker_id  ExtraArgsDecl)
+//
+// Template arguments:
+//   Name             - Name of function to declare.
+//   AccT             - Type of accumulator variable. Can't be vector type. Examples: int, float, half.
+//   InputT           - Type of input data. Can't be vector type. Examples: int, float, half.
+//   SliceSize        - Number of values in the packed slice to accumulate in each work-item. One of: 2, 4, 8, 16.
+//   SlicePitch       - Pitch between consecutive input slices in "input".
+//   Items            - Total number of items to accumulate across all work-items.
+//   Workers          - Number of work-items performing accumulation.
+//   AccOp            - Name of the operation used to perform accumulation.
+//                      It is invoked like a function and must return the new accumulator value.
+//                      Expected interface:
+//                          AccT AccOp(AccT current, InputT val, uint index ExtraArgs)
+//                          current - current accumulation value
+//                          val - currently processed input value
+//                          index - number of item inside slice currently processed
+//                          ExtraArgs - optional extra arguments passed as is from template argument
+//                          returns: new accumulator value after accumulating "val" with "current"
+//   ExtraArgsDecl    - Optional extra arguments declaration to pass to function.
+//   ExtraArgs        - Optional extra arguments to pass to "AccOp" using names declared in "ExtraArgsDecl".
+//
+// Function arguments:
+//   input          - Pointer to global memory from which values will be read to accumulate
+//   offset         - Offset into "input" from where accumulation should start
+//   worker_id      - Number of current work-item
+//   ExtraArgsDecl  - Optional extra arguments, declared from template argument.
+// 
+// Pseudocode:
+//  function Name(input, offset, worker_id, ExtraArgs... eargs) {
+//      AccT<SliceSize> accumulator = 0;
+//      for (uint idx = worker_id; idx < Items; idx += Workers) {
+//          InputT<SliceSize> in = vload<SliceSize>(0, &input[offset + idx * SlicePitch]);
+//          for (uint si = 0; si < SliceSize; ++si) {
+//              accumulator[si] = AccOp(accumulator[si], in[si], si, eargs...)
+//          }
+//      }
+//      return accumulator;
+//  }
+//
+// ==============================================================================================================================
+
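+// Illustrative instantiation (a sketch only; "acc_sum_example" and the numeric arguments are
+// placeholders, not values used by the kernels in this directory):
+//
+//   DECLARE_PACKED_ACCUMULATE(acc_sum_example, int, char, 16 /*SliceSize*/, 16 /*SlicePitch*/,
+//                             256 /*Items*/, 64 /*Workers*/, ACCUMULATE_SUM)
+//
+//   // inside a kernel: each of the 64 workers sums every 64th slice of 16 chars
+//   int16 partial = FUNC_CALL(acc_sum_example)(input, offset, worker_id);
+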
+#define ACCUMULATE_SUM(a, b, idx)       ((a) + (b))
+#define ACCUMULATE_SUM_SQ(a, b, idx)    ((a) + ((b) * (b)))
+
+#define DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, ExtraArgsDecl, ExtraArgs)     \
+inline MAKE_VECTOR_TYPE(AccT, SliceSize) FUNC(Name)(const __global InputT* input,                                                       \
+                                                    uint offset,                                                                        \
+                                                    uint worker_id                                                                      \
+                                                    ExtraArgsDecl) {                                                                    \
+    typedef MAKE_VECTOR_TYPE(InputT, SliceSize) packed_in_t;                                                                            \
+    typedef MAKE_VECTOR_TYPE(AccT, SliceSize) packed_acc_t;                                                                             \
+                                                                                                                                        \
+    packed_acc_t acc = 0;  /* Accumulation variable */                                                                                  \
+                                                                                                                                        \
+    uint input_offset = offset + worker_id * (SlicePitch);  /* Current input offset */                                                  \
+                                                                                                                                        \
+    /* Uniform loop to help compiler in unrolling */                                                                                    \
+    for (uint spatial_idx = 0; spatial_idx < (Items) / (Workers); ++spatial_idx) {                                                      \
+        packed_in_t in_pack = ((const __global packed_in_t*)(input + input_offset))[0];                                                 \
+                                                                                                                                        \
+        input_offset += (Workers) * (SlicePitch);                                                                                       \
+                                                                                                                                        \
+        __attribute__((opencl_unroll_hint))                                                                                             \
+        for (uint set_idx = 0; set_idx < (SliceSize); ++set_idx) {                                                                      \
+            acc[set_idx] = AccOp(acc[set_idx], in_pack[set_idx], set_idx  ExtraArgs);                                                   \
+        }                                                                                                                               \
+    }                                                                                                                                   \
+                                                                                                                                        \
+    /* [constexpr] Number of leftovers after all uniform iterations */                                                                  \
+    const uint leftovers = (Items) % (Workers);                                                                                         \
+                                                                                                                                        \
+    if (leftovers > 0 && worker_id < leftovers) {                                                                                       \
+        packed_in_t in_pack = ((const __global packed_in_t*)(input + input_offset))[0];                                                 \
+                                                                                                                                        \
+        __attribute__((opencl_unroll_hint))                                                                                             \
+        for (uint set_idx = 0; set_idx < (SliceSize); ++set_idx) {                                                                      \
+            acc[set_idx] = AccOp(acc[set_idx], in_pack[set_idx], set_idx  ExtraArgs);                                                   \
+        }                                                                                                                               \
+    }                                                                                                                                   \
+                                                                                                                                        \
+    return acc;                                                                                                                         \
+}
+
+#define DECLARE_PACKED_ACCUMULATE(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp)                                     \
+    DECLARE_PACKED_ACCUMULATE_EARGS(Name, AccT, InputT, SliceSize, SlicePitch, Items, Workers, AccOp, , )
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_reduce.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/mvn_gpu_b_fs_yx_fsv16_imad_reduce.cl
new file mode 100644 (file)
index 0000000..1b61b79
--- /dev/null
@@ -0,0 +1,125 @@
+// Copyright (c) 2020 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+
+// ==============================================================================================================================
+// DECLARE_WG_PACKED_REDUCE_ADD(Name, Type, VecSize, SgNum, PostOp)
+//
+// Declares function "Name" performing work-group reduction on vector data, using addition operator:
+//   Type Name (Type<VecSize> value, __local Type* slm_acc)
+// Returns reduction result as sub-group vector, for example when VecSize equals 4:
+//   work-item for which get_sub_group_local_id() == 0 will hold reduced values from value.s0
+//   work-item for which get_sub_group_local_id() == 1 will hold reduced values from value.s1
+//   work-item for which get_sub_group_local_id() == 2 will hold reduced values from value.s2
+//   work-item for which get_sub_group_local_id() == 3 will hold reduced values from value.s3
+//  for other work-items in sub-group the result will be undefined.
+// All work-items in sub-group must enter declared function.
+//
+// Template arguments:
+//   Name    - Name of function to declare.
+//   Type    - Type of values to reduce.  Can't be vector type. Examples: int, float, half.
+//   VecSize - Vector size of input, one of 2,4,8,16. Must be smaller or equal to sub-group size.
+//   SgNum   - Number of sub-groups inside work-group.
+//   PostOp  - Operation to perform on reduced values.
+//             Called as PostOp(value), where "value" is reduction result, and call should evaluate to expression returning final result.
+//
+// Function arguments:
+//   value   - vector of "VecSize" elements of "Type" holding values to reduce.
+//   slm_acc - pointer to local memory used for reduction. Must have size of at least ("SgNum" - 1) * "VecSize".
+//
+// Pseudocode:
+//  function Name(value, slm_acc) {
+//      Type result;
+//      for (uint vi = 0; vi < VecSize; ++vi) {
+//          Type tmp = work_group_reduce_add(value[vi]);
+//          if (get_sub_group_local_id() == vi) {
+//              result = tmp;
+//          }
+//      }
+//      return result;
+// }
+//
+// Notes:
+//   If the local memory is going to be reused, an additional barrier(CLK_LOCAL_MEM_FENCE) is required to ensure that all usage inside
+//   the declared function has finished.
+// ==============================================================================================================================
+
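+// Illustrative instantiation (a sketch only; "reduce_sum_example" and the sizes are placeholders,
+// not values used by the kernels in this directory):
+//
+//   DECLARE_WG_PACKED_REDUCE_ADD(reduce_sum_example, float, 16 /*VecSize*/, 4 /*SgNum*/, REDUCE_NO_POST_OP)
+//
+//   // inside a kernel with 4 sub-groups of SIMD >= 16 work-items, where "partial" is a float16:
+//   __local float slm_acc[(4 - 1) * 16];
+//   float reduced = FUNC_CALL(reduce_sum_example)(partial, slm_acc);
+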
+#define REDUCE_NO_POST_OP(val) (val)
+
+#define DECLARE_WG_PACKED_REDUCE_ADD(Name, Type, VecSize, SgNum, PostOp)                                                \
+    inline Type FUNC(Name) (MAKE_VECTOR_TYPE(Type, VecSize) value, __local Type* slm_acc) {                             \
+        typedef MAKE_VECTOR_TYPE(Type, VecSize) packed_t;                                                               \
+                                                                                                                        \
+        Type result;                                                                                                    \
+                                                                                                                        \
+        /* [uniform] Current sub-groups id */                                                                           \
+        const uint sgid = get_sub_group_id();                                                                           \
+        /* Id of work-item inside sub-group */                                                                          \
+        const uint sglid = get_sub_group_local_id();                                                                    \
+        /* [constexpr] Maximum simd/sub-group size */                                                                   \
+        const uint simd = get_max_sub_group_size();                                                                     \
+                                                                                                                        \
+        /* Accumulation inside sub-group */                                                                             \
+        packed_t acc;  /* [uniform] Accumulator variable */                                                             \
+        __attribute__((opencl_unroll_hint))                                                                             \
+        for (uint idx = 0; idx < VecSize; ++idx) {                                                                      \
+            acc[idx] = sub_group_reduce_add(value[idx]);                                                                \
+        }                                                                                                               \
+        if ((SgNum) != 1) {                                                                                             \
+            /* More than one sub-group in work-group, reduce using local memory */                                      \
+            /* Store partial results into local memory from sub-groups other than first one */                          \
+            if (sgid != 0 && (sglid < VecSize || simd == VecSize)) {                                                    \
+                slm_acc[(sgid - 1) * VecSize + sglid] = acc[sglid];                                                     \
+            }                                                                                                           \
+            barrier(CLK_LOCAL_MEM_FENCE);                                                                               \
+            /* Accumulate partial results inside first sub-group */                                                     \
+            if (sgid == 0) {                                                                                            \
+                __attribute__((opencl_unroll_hint))                                                                     \
+                for (uint vi = 0; vi < VecSize; ++vi) {                                                                 \
+                    /* Accumulate single vector element using sub_group_reduce_add */                                   \
+                    /* Last work-item inside sub-group holds previous value (iteration or sub-group reduction stage) */ \
+                                                                                                                        \
+                    Type tmp = acc[vi];                                                                                 \
+                    __attribute__((opencl_unroll_hint))                                                                 \
+                    for (uint sg = 0; sg < (SgNum) - 1; sg += (simd - 1)) {                                             \
+                        bool last_sglid = sglid == simd - 1;                                                            \
+                        bool sglid_inside_sgs = sg + simd - 1 <= (SgNum) - 1 || sg + sglid < (SgNum) - 1;               \
+                        Type tmp_in_slm = slm_acc[sg * VecSize + sglid * VecSize + vi];                                 \
+                        tmp = last_sglid ? tmp :                                                                        \
+                              sglid_inside_sgs ? tmp_in_slm                                                             \
+                              : 0;                                                                                      \
+                        tmp = sub_group_reduce_add(tmp);                                                                \
+                    }                                                                                                   \
+                    acc[vi] = tmp;                                                                                      \
+                }                                                                                                       \
+                if (sglid < VecSize || simd == VecSize) {                                                               \
+                    result = PostOp(acc[sglid]);                                                                        \
+                    slm_acc[sglid] = result;                                                                            \
+                }                                                                                                       \
+            }                                                                                                           \
+            barrier(CLK_LOCAL_MEM_FENCE);                                                                               \
+            /* Read result in all other sub-groups */                                                                   \
+            if (sgid != 0 && (sglid < VecSize || simd == VecSize)) {                                                    \
+                result = slm_acc[sglid];                                                                                \
+            }                                                                                                           \
+        } else {                                                                                                        \
+            /* Single sub-group case, just transpose the data to correct layout */                                      \
+            if (sglid < VecSize || simd == VecSize) {                                                                   \
+                result = PostOp(acc[sglid]);                                                                            \
+                slm_acc[sglid] = result;                                                                                \
+            }                                                                                                           \
+        }                                                                                                               \
+        return result;                                                                                                  \
+    }
index 6c8d8e2..cdb4cd1 100644 (file)
@@ -87,7 +87,7 @@ KERNEL(pooling_gpu_int8_ref)(
 #elif OUTPUT_LAYOUT_B_FS_YX_FSV16
     const uint x = get_global_id(1);
     const uint y = get_global_id(2);
-    const uint bf = get_global_id(0);
+    const uint bf = (uint)get_global_id(0);
     const uint f = bf / INPUT0_BATCH_NUM;
     const uint b = bf % INPUT0_BATCH_NUM;
     const uint z = 0;
index 445d69a..acb9f6d 100644 (file)
@@ -18,6 +18,9 @@
 
 #include "include/data_types.cl"
 
+#define INPUT_TYPE4 MAKE_VECTOR_TYPE(INPUT_REORDER_TYPE, 4)
+#define OUTPUT_TYPE4 MAKE_VECTOR_TYPE(OUTPUT_REORDER_TYPE, 4)
+
 ///////////////////////// Input Index /////////////////////////
 inline uint FUNC(get_input_index)(uint b, uint f, uint w, uint z, uint y, uint x)
 {
@@ -48,12 +51,16 @@ inline uint FUNC(get_output_index)(uint b, uint f, uint w, uint z, uint y, uint
 }
 
 KERNEL (reorder_data)(
-#if defined INPUT0_LAYOUT_NV12
+#if INPUT0_LAYOUT_NV12 || INPUT0_LAYOUT_IMAGE_2D_RGBA
     read_only image2d_t input,
 #else
     const __global INPUT_REORDER_TYPE* input,
 #endif
+#if OUTPUT_LAYOUT_IMAGE_2D_RGBA
+    write_only image2d_t output
+#else
     __global OUTPUT_REORDER_TYPE* output
+#endif
 #ifdef MEAN_SUBTRACT_IN_BUFFER
     , __global MEAN_SUBTRACT_TYPE* mean_subtract
 #endif
@@ -95,7 +102,7 @@ KERNEL (reorder_data)(
 #if defined INPUT0_LAYOUT_NV12
     const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP;
     float4 colorVYU = read_imagef(input, sampler, (int2)(x, y));
-    
+
     float Ycomponent = mad(colorVYU.s1, 296.82f, -18.624f);
     float Ucomponent = mad(colorVYU.s2, 255.0f, -128.f);
     float Vcomponent = mad(colorVYU.s0, 255.0f, -128.f);
@@ -103,7 +110,20 @@ KERNEL (reorder_data)(
     float B = clamp(mad(Vcomponent, 1.596f, Ycomponent), 0.f, 255.f);
     float R = clamp(mad(Ucomponent, 2.018f, Ycomponent), 0.f, 255.f);
     float G = clamp(mad(Vcomponent, -0.813f, mad(Ucomponent, -0.391f, Ycomponent)), 0.f, 255.f);
-
+#elif defined INPUT0_LAYOUT_IMAGE_2D_RGBA
+    const sampler_t imageSampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP;
+    OUTPUT_TYPE4 colorRGBA = IMAGE_READ(input, (int2)(x, y));
+#elif defined OUTPUT_LAYOUT_IMAGE_2D_RGBA
+    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
+    const uint input_idx_R  = FUNC_CALL(get_input_index)(b, 0, w, z, y, x);
+    const uint input_idx_G  = FUNC_CALL(get_input_index)(b, 1, w, z, y, x);
+    const uint input_idx_B  = FUNC_CALL(get_input_index)(b, 2, w, z, y, x);
+#if OUTPUT_FEATURE_NUM == 3
+    INPUT_TYPE4 colorRGBA = { TO_INPUT_REORDER_TYPE(input[input_idx_R]), TO_INPUT_REORDER_TYPE(input[input_idx_G]), TO_INPUT_REORDER_TYPE(input[input_idx_B]), TO_INPUT_REORDER_TYPE(0.f) };
+#else
+    const uint input_idx_A  = FUNC_CALL(get_input_index)(b, 3, w, z, y, x);
+    INPUT_TYPE4 colorRGBA = { TO_INPUT_REORDER_TYPE(input[input_idx_R]), TO_INPUT_REORDER_TYPE(input[input_idx_G]), TO_INPUT_REORDER_TYPE(input[input_idx_B]), TO_INPUT_REORDER_TYPE(input[input_idx_A]) };
+#endif
 #else
     uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, f, w, z, y, x);
     const uint input_idx  = FUNC_CALL(get_input_index)(b, f, w, z, y, x);
@@ -137,6 +157,23 @@ KERNEL (reorder_data)(
     ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
     output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
     output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(B), NL_M, NL_N);
+#elif INPUT0_LAYOUT_IMAGE_2D_RGBA
+    uint8 ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 0, w, z, y, x);
+    uint output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s0), NL_M, NL_N);
+    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 1, w, z, y, x);
+    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s1), NL_M, NL_N);
+    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 2, w, z, y, x);
+    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s2), NL_M, NL_N);
+#if INPUT0_FEATURE_NUM == 4
+    ov = RESHAPE_DIMS(INPUT0, OUTPUT, b, 3, w, z, y, x);
+    output_idx = FUNC_CALL(get_output_index)(ov[1], ov[2], ov[3], ov[4], ov[5], ov[6]);
+    output[output_idx] = ACTIVATION_FUNC_TYPED(OUTPUT_REORDER, TO_OUTPUT_REORDER_TYPE(colorRGBA.s3), NL_M, NL_N);
+#endif
+#elif OUTPUT_LAYOUT_IMAGE_2D_RGBA
+    IMAGE_WRITE(output, (int2)(x, y), colorRGBA);
 #else
 #if INPUT0_IS_FP && !OUTPUT_IS_FP
     // TODO: check if this round really needed. Right now it's added to have the same behavior as CPU plugin
@@ -147,3 +184,6 @@ KERNEL (reorder_data)(
 #endif
 #endif
 }
+
+#undef INPUT_TYPE4
+#undef OUTPUT_TYPE4
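
The reorder_data changes above teach the kernel to read from or write to an RGBA image: when the input is image_2d_rgba, one image read yields all four channels of a pixel, which is then scattered into planar feature slices; when the output is image_2d_rgba, three or four planar feature values are gathered into one pixel and written with a single IMAGE_WRITE. A minimal host-side sketch of that planar/interleaved mapping, assuming a dense bfyx buffer and illustrative names:

    #include <array>
    #include <cstddef>
    #include <vector>

    // Scatter one interleaved RGBA pixel at (x, y) into a planar [C][H][W] buffer.
    void rgba_pixel_to_planar(const std::array<float, 4>& rgba, std::vector<float>& planar,
                              std::size_t C, std::size_t H, std::size_t W,
                              std::size_t y, std::size_t x) {
        for (std::size_t c = 0; c < C && c < 4; ++c)
            planar[(c * H + y) * W + x] = rgba[c];
    }

    // Gather a planar pixel back into RGBA; with only three features the alpha slot
    // stays 0, matching the OUTPUT_FEATURE_NUM == 3 branch in the kernel.
    std::array<float, 4> planar_to_rgba_pixel(const std::vector<float>& planar,
                                              std::size_t C, std::size_t H, std::size_t W,
                                              std::size_t y, std::size_t x) {
        std::array<float, 4> rgba{0.f, 0.f, 0.f, 0.f};
        for (std::size_t c = 0; c < C && c < 4; ++c)
            rgba[c] = planar[(c * H + y) * W + x];
        return rgba;
    }
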
index 07b69f2..5a42d46 100644 (file)
@@ -91,6 +91,12 @@ inline uint FUNC(get_input_index)(uint g, uint o, uint i, uint z, uint y, uint x
 #elif defined INPUT0_LAYOUT_GYXIO || \
       defined INPUT0_LAYOUT_GOIYX
     return GET_FILTER_GOIYX(INPUT0, g, o, i, y, x);
+#elif defined INPUT0_LAYOUT_OS_IS_YX_OSV16_ISV16
+    return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(INPUT0, o, i, y, x);
+#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV16_YXSV4
+    return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(INPUT0, g, o, i, y, x);
+#elif defined INPUT0_LAYOUT_GS_OI_YXS_GSV32_YXSV4
+    return GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(INPUT0, g, o, i, y, x);
 #else
 #error reorder_weights.cl: input format - not supported
 #endif
@@ -198,6 +204,14 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint
     return GET_FILTER_GS_OI_YXS_GSV4_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
 #elif defined OUTPUT_LAYOUT_G_OS_IS_YX_ISV16_OSV16
     return GET_FILTER_G_OS_IS_YX_ISV16_OSV16_INDEX(OUTPUT, g, o, i, y, x, SUB_GROUP_SIZE);
+#elif defined OUTPUT_LAYOUT_OS_IS_YX_OSV16_ISV16
+    return GET_FILTER_OS_IS_YX_OSV16_ISV16_INDEX(OUTPUT, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV16_YXSV4
+    return GET_FILTER_GS_OI_YXS_GSV16_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_GS_OI_YXS_GSV32_YXSV4
+    return GET_FILTER_GS_OI_YXS_GSV32_YXSV4_INDEX(OUTPUT, g, o, i, y, x);
+#elif defined OUTPUT_LAYOUT_G_OS_IS_YX_OSV16_ISV4
+    return GET_FILTER_G_OS_IS_YX_OSV16_ISV4_INDEX(OUTPUT, g, o, i, y, x);
 #else
 #error reorder_weights.cl: output format - not supported
 #endif
index e45c1dd..c965d04 100644 (file)
@@ -49,7 +49,37 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
 #endif
 )
 {
-#if defined(SAMPLE_TYPE_NEAREST)
+#if defined(SAMPLE_TYPE_NEAREST) && FEATURE_PACKED_MODE
+    typedef MAKE_VECTOR_TYPE(INPUT0_TYPE, PACK_SIZE) in_pack_t;
+    typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, PACK_SIZE) out_pack_t;
+
+    const int ox = get_global_id(0);
+    const int oy = get_global_id(1) % OUTPUT_SIZE_Y;
+    const int oz = get_global_id(1) / OUTPUT_SIZE_Y;
+    const int feature = (get_global_id(2) * PACK_SIZE) % OUTPUT_FEATURE_NUM;
+    const int batch = (get_global_id(2) * PACK_SIZE) / OUTPUT_FEATURE_NUM;
+    const int ix = floor(ox * X_RATIO);
+    const int iy = floor(oy * Y_RATIO);
+    const int iz = floor(oz * Z_RATIO);
+
+    uint input_idx = FUNC_CALL(get_input_index)(batch, feature, iz, iy, ix);
+    uint output_idx = FUNC_CALL(get_output_index)(batch, feature, oz, oy, ox);
+
+    in_pack_t interp_val_pack = ((const __global in_pack_t*)(input + input_idx))[0];
+    out_pack_t res;
+    unroll_for (uint pi = 0; pi < PACK_SIZE; ++pi) {
+        INPUT0_TYPE interp_val = interp_val_pack[pi];
+    #if HAS_FUSED_OPS
+        #define OF_ID (feature + pi)
+        FUSED_OPS;
+        res[pi] = FUSED_OPS_RESULT;
+    #else
+        res[pi] = ACTIVATION(interp_val, ACTIVATION_PARAMS);
+    #endif
+    }
+    ((__global out_pack_t*)(output + output_idx))[0] = res;
+
+#elif defined(SAMPLE_TYPE_NEAREST)
     const int ox = get_global_id(0);
 #if OUTPUT_DIMS <= 4
     const int oy = get_global_id(1);
@@ -79,29 +109,29 @@ KERNEL (resample_gpu_ref)(__global INPUT0_TYPE* input,
     const int oy = get_global_id(1);
     const int feature = 0;
     const int batch = get_global_id(2);
-    const INPUT0_TYPE ix = TO_INPUT0_TYPE(X_RATIO) * ox;
-    const INPUT0_TYPE iy = TO_INPUT0_TYPE(Y_RATIO) * oy;
+    const float ix = X_RATIO * ox;
+    const float iy = Y_RATIO * oy;
 
 #ifdef LEFTOVERS
     if (ox >= OUTPUT_SIZE_X)
         return;
 #endif
 
-    const int top_y_index    = (int)(floor(iy));
-    const int bottom_y_index = (int)(min(ceil(iy), TO_INPUT0_TYPE(INPUT0_SIZE_Y) - 1));
-    const int left_x_index   = (int)(floor(ix));
-    const int right_x_index  = (int)(min(ceil(ix), TO_INPUT0_TYPE(INPUT0_SIZE_X) - 1));
+    const int top_y_index = (int)(floor(iy));
+    const int bottom_y_index = (int)(min(TO_INPUT0_TYPE(ceil(iy)), TO_INPUT0_TYPE(INPUT0_SIZE_Y) - 1));
+    const int left_x_index = (int)(floor(ix));
+    const int right_x_index = (int)(min(TO_INPUT0_TYPE(ceil(ix)), TO_INPUT0_TYPE(INPUT0_SIZE_X) - 1));
 
-    const INPUT0_TYPE dx = ix - left_x_index;
-    const INPUT0_TYPE dy = iy - top_y_index;
+    const INPUT0_TYPE dx = TO_INPUT0_TYPE(ix - left_x_index);
+    const INPUT0_TYPE dy = TO_INPUT0_TYPE(iy - top_y_index);
 
-    unroll_for (int in_f = 0; in_f < OUTPUT_FEATURE_NUM; in_f++) {
-        INPUT0_TYPE top_left     = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, left_x_index)];
-        INPUT0_TYPE top_right    = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, right_x_index)];
-        INPUT0_TYPE bottom_left  = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, left_x_index)];
+    unroll_for(int in_f = 0; in_f < OUTPUT_FEATURE_NUM; in_f++) {
+        INPUT0_TYPE top_left = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, left_x_index)];
+        INPUT0_TYPE top_right = input[INPUT0_GET_INDEX(batch, in_f, top_y_index, right_x_index)];
+        INPUT0_TYPE bottom_left = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, left_x_index)];
         INPUT0_TYPE bottom_right = input[INPUT0_GET_INDEX(batch, in_f, bottom_y_index, right_x_index)];
 
-        INPUT0_TYPE top    = top_left + (top_right - top_left) * dx;
+        INPUT0_TYPE top = top_left + (top_right - top_left) * dx;
         INPUT0_TYPE bottom = bottom_left + (bottom_right - bottom_left) * dx;
 
         INPUT0_TYPE interp_val = top + (bottom - top) * dy;
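
Two changes land in resample_gpu_ref: a new feature-packed fast path for nearest sampling that loads and stores PACK_SIZE features as one vector, and the bilinear path now computes source coordinates in plain float before converting only the fractional offsets to INPUT0_TYPE, which avoids precision loss when INPUT0_TYPE is half. A scalar sketch of that bilinear computation, assuming a dense single-feature [H][W] plane and illustrative names:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    float resample_bilinear_at(const std::vector<float>& src, int in_h, int in_w,
                               float x_ratio, float y_ratio, int ox, int oy) {
        const float ix = x_ratio * ox;                 // source coordinates stay in float
        const float iy = y_ratio * oy;
        const int left   = static_cast<int>(std::floor(ix));
        const int top    = static_cast<int>(std::floor(iy));
        const int right  = std::min(static_cast<int>(std::ceil(ix)), in_w - 1);
        const int bottom = std::min(static_cast<int>(std::ceil(iy)), in_h - 1);
        const float dx = ix - left;                    // fractional offsets
        const float dy = iy - top;

        const float tl = src[top * in_w + left],    tr = src[top * in_w + right];
        const float bl = src[bottom * in_w + left], br = src[bottom * in_w + right];
        const float t = tl + (tr - tl) * dx;
        const float b = bl + (br - bl) * dx;
        return t + (b - t) * dy;
    }
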
index d36a49a..7bbe95e 100644 (file)
@@ -431,10 +431,14 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
     } else if (_tensor.LogicalSize() == _tensor.Feature().v) {
         // We support broadcast only if corresponding dimension is equal to 1.
         // Otherwise, dimensions should be equal and using "f" should be safe.
-        if (_tensor.PitchesDifferFromLogicalDims()) {
+        if (_tensor.PitchesDifferFromLogicalDims() && _tensor.SimpleLayout()) {
             std::string f_pitch = std::to_string(_tensor.Feature().pitch);
             definitions.push_back({ safe_index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
             definitions.push_back({ index_func_name, "(" + offset + " + (f) * " + f_pitch + ")" });
+        } else if (_tensor.PitchesDifferFromLogicalDims()) {
+            // TODO This should be solved differently, by setting the macro arguments to zero
+            definitions.push_back({ safe_index_func_name, safe_index_func_val });
+            definitions.push_back({ index_func_name, index_func_val });
         } else {
             definitions.push_back({ safe_index_func_name, "f" });
             definitions.push_back({ index_func_name, "f" });
index 41e78f0..0901ce2 100644 (file)
@@ -61,13 +61,17 @@ class OpenCL2CHeaders(object):
         res = '{{"{}",\n(std::string) R"__krnl(\n'.format(kernel_name)
         content = self.append_file_content(filename, filename)
         max_lines = 200
+        max_characters = 16350
+        characters = 1  # Newline character above
 
         for i, line in enumerate(content.split('\n')):
-            if i % max_lines == 0:
+            if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters:
                 res += ')__krnl"\n + R"__krnl('
+                characters = 0
             res += line + '\n'
+            characters += len(line) + 1
 
-        res += ')__krnl"}},\n\n'.format(kernel_name, self.append_file_content(filename, filename))
+        res += ')__krnl"}},\n\n'.format(kernel_name)
 
         return res
 
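
The generator change above splits each embedded kernel not only every max_lines lines but also whenever roughly 16 K characters accumulate, which should keep every individual string literal under common compiler limits on literal length (MSVC caps a single literal at about 16 380 characters). The chunks are re-joined with operator+, so the reassembled kernel source is unchanged. A minimal sketch of that joining, with illustrative strings:

    #include <cassert>
    #include <string>

    int main() {
        // Chunks are concatenated with operator+, mirroring the ')__krnl"\n + R"__krnl('
        // break emitted by the script, so the resulting source is byte-identical.
        std::string kernel = (std::string) R"__krnl(__kernel void noop() {})__krnl"
                           + R"__krnl( /* second chunk */)__krnl";
        assert(kernel.find("noop") != std::string::npos);
        return 0;
    }
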
index 0aa29a6..3c35eea 100644 (file)
@@ -82,11 +82,10 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
     for (auto& c : conf) {
         std::string fused_ops;
         std::string fused_ops_preload;
-        std::string fused_ops_calc_only;
+        std::string fused_ops_calc;
         std::string in_name = c.input_var_name;
         Datatype in_type = c.input_dt;
-
-        bool can_use_preload = true;
+        bool can_all_use_preload = true;
 
         for (size_t i = 0; i < params.fused_ops.size(); i++) {
             auto fused_dep_codegen = FusedOpsCodeGenerator(params.fused_ops[i]);
@@ -97,20 +96,26 @@ JitConstants KernelBase::MakeFusedOpsJitConstants(const kernel_selector::base_pa
             in_name = out_var;
             in_type = out_type;
 
-            can_use_preload &= fused_dep_codegen.CanPreloadData(c);
+            bool can_use_preload = fused_dep_codegen.CanPreloadData(c);
+            can_all_use_preload &= can_use_preload;
 
             fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
             fused_ops += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
-            fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
-            fused_ops_calc_only += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
+            if (can_use_preload)
+                fused_ops_preload += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
+            if (c.allow_for_partial_preload && !can_use_preload)
+                fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_LOAD" + c.suffix;
+            fused_ops_calc += "\\\n\tFUSED_OP" + std::to_string(i) + "_ACTION" + c.suffix;
         }
 
         jit.AddConstant(MakeJitConstant("FUSED_OPS" + c.suffix, fused_ops));
         jit.AddConstant(MakeJitConstant("FUSED_OPS_PRELOAD" + c.suffix, fused_ops_preload));
-        jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc_only));
+        jit.AddConstant(MakeJitConstant("FUSED_OPS_CALC" + c.suffix, fused_ops_calc));
         jit.AddConstant(MakeJitConstant("FUSED_OPS_RESULT" + c.suffix, in_name));
 
-        jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix, can_use_preload));
+        bool can_any_use_preload = !fused_ops_preload.empty();
+        jit.AddConstant(MakeJitConstant("FUSED_OPS_CAN_USE_PRELOAD" + c.suffix,
+            can_all_use_preload || (c.allow_for_partial_preload && can_any_use_preload)));
     }
 
     jit.Merge(MakeFusedOpsDeclsJitConstants(params, conf));
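
The rework above splits the per-op bookkeeping: every fused op still contributes its LOAD and ACTION macros to FUSED_OPS, but FUSED_OPS_PRELOAD now receives LOAD only from ops that can actually preload, and when partial preload is allowed the remaining LOADs are emitted into FUSED_OPS_CALC instead, so FUSED_OPS_CAN_USE_PRELOAD can be set as soon as any op preloads. A compact sketch of that distribution, with types and the preload predicate assumed for illustration:

    #include <cstddef>
    #include <string>
    #include <vector>

    struct FusedOpsStrings { std::string all, preload, calc; bool can_use_preload = false; };

    FusedOpsStrings distribute(const std::vector<bool>& can_preload, bool allow_partial) {
        FusedOpsStrings r;
        bool all_preload = true, any_preload = false;
        for (std::size_t i = 0; i < can_preload.size(); ++i) {
            const std::string load   = "FUSED_OP" + std::to_string(i) + "_LOAD ";
            const std::string action = "FUSED_OP" + std::to_string(i) + "_ACTION ";
            r.all += load + action;                        // FUSED_OPS: always load + act
            if (can_preload[i]) { r.preload += load; any_preload = true; }
            else { all_preload = false; if (allow_partial) r.calc += load; }
            r.calc += action;                              // FUSED_OPS_CALC: act (plus late loads)
        }
        r.can_use_preload = all_preload || (allow_partial && any_preload);
        return r;
    }
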
index c711cae..8283c5c 100644 (file)
@@ -117,6 +117,7 @@ std::string toString(DataLayout l) {
         case kernel_selector::DataLayout::bs_fs_yx_bsv16_fsv16:  return "BS_FS_YX_BSV16_FSV16";
         case kernel_selector::DataLayout::bs_fs_zyx_bsv16_fsv16: return "BS_FS_ZYX_BSV16_FSV16";
         case kernel_selector::DataLayout::nv12:                  return "NV12";
+        case kernel_selector::DataLayout::image_2d_rgba:         return "IMAGE_2D_RGBA";
         default:
             return "";
     }
@@ -296,7 +297,7 @@ std::string toString(MVNMode mode) {
 }
 
 std::string toString(WeightsLayout layout) {
-    switch (layout) {
+   switch (layout) {
         case WeightsLayout::oi:                                          return "OI";
         case WeightsLayout::io:                                          return "IO";
         case WeightsLayout::oiyx:                                        return "OIYX";
@@ -304,6 +305,7 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::iyxo:                                        return "IYXO";
         case WeightsLayout::yxio:                                        return "YXIO";
         case WeightsLayout::os_is_yx_isv16_osv16:                        return "OS_IS_YX_ISV16_OSV16";
+        case WeightsLayout::os_is_yx_osv16_isv16:                        return "OS_IS_YX_OSV16_ISV16";
         case WeightsLayout::os_iyx_osv16:                                return "OS_IYX_OSV16";
         case WeightsLayout::os_iyx_osv32:                                return "OS_IYX_OSV32";
         case WeightsLayout::os_iyx_osv32__ai32:                          return "OS_IYX_OSV32__AI32";
@@ -362,7 +364,10 @@ std::string toString(WeightsLayout layout) {
         case WeightsLayout::giy_xs_os_xsv2_osv16__ao32:                  return "GIY_XS_OS_XSV2_OSV16__AO32";
         case WeightsLayout::giy_xs_os_xsv2_osv8__ao32:                   return "GIY_XS_OS_XSV2_OSV8__AO32";
         case WeightsLayout::gs_oi_yxs_gsv4_yxsv4:                        return "GS_OI_YXS_GSV4_YXSV4";
+        case WeightsLayout::gs_oi_yxs_gsv16_yxsv4:                       return "GS_OI_YXS_GSV16_YXSV4";
+        case WeightsLayout::gs_oi_yxs_gsv32_yxsv4:                       return "GS_OI_YXS_GSV32_YXSV4";
         case WeightsLayout::g_os_is_yx_isv16_osv16:                      return "G_OS_IS_YX_ISV16_OSV16";
+        case WeightsLayout::g_os_is_yx_osv16_isv4:                       return "G_OS_IS_YX_OSV16_ISV4";
         default: throw std::invalid_argument("Failed to convert WeightsLayout " + std::to_string(layout) + " to string");
     }
 }
index 7c2b1f1..92f2601 100644 (file)
@@ -386,6 +386,7 @@ void ParamsKey::EnableLookUpTableIndicesFormat(Datatype a) {
 }
 
 void ParamsKey::EnableFusedConvEltwiseRWOutOpt() { key.restrict.val.dedicated.fused_conv_eltw.rw_out_opt = 1; }
+void ParamsKey::EnableFusedConvEltwDepthToSpaceFusing() { key.restrict.val.dedicated.fused_conv_eltw.depth_to_space_fused = 1; }
 
 
 void ParamsKey::EnableQuantization(QuantizationType q) {
@@ -466,6 +467,10 @@ ParamsKey Params::GetParamsKey() const {
         k.EnableSubGroupShort();
     }
 
+    if (engineInfo.bSubGroupCharSupport) {
+        k.EnableSubGroupChar();
+    }
+
     return k;
 }
 
index 6121b50..9ac8306 100644 (file)
@@ -217,6 +217,7 @@ public:
                         uint32_t stride : 1;
                         // fused conv eltw
                         uint32_t rw_out_opt : 1;
+                        uint32_t depth_to_space_fused : 1;
                     } fused_conv_eltw;
                     struct quantize_t {
                         uint32_t packed_binary_output : 1;
@@ -231,6 +232,7 @@ public:
             struct val_t {
                 uint32_t subgroup : 1;
                 uint32_t subgroupShort : 1;
+                uint32_t subgroupChar : 1;
             } val;
             uint32_t raw;
         } machineInfo;
@@ -293,6 +295,7 @@ public:
     void EnableGradient() { key.restrict.val.gradient = 1; }
     void EnableSubGroup() { key.machineInfo.val.subgroup = 1; }
     void EnableSubGroupShort() { key.machineInfo.val.subgroupShort = 1; }
+    void EnableSubGroupChar() { key.machineInfo.val.subgroupChar = 1; }
     void EnableNonBiasTerm() { key.restrict.val.nonBias = 1; }
     void EnableBiasPerFeature() { key.restrict.val.biasPerFeatureMap = 1; }
     void EnableBiasPerOutput() { key.restrict.val.biasPerOutput = 1; }
@@ -330,6 +333,7 @@ public:
     void EnableFusedConvEltwInt8Quantization() { key.restrict.val.dedicated.fused_conv_eltw.quantization = 1; }
     void EnableFusedConvEltwOutputCalibration() { key.restrict.val.dedicated.fused_conv_eltw.calibration = 1; }
     void EnableFusedConvEltwEltwiseStride();
+    void EnableFusedConvEltwDepthToSpaceFusing();
 
     void EnableQuantizePackedBinaryOutput() { key.restrict.val.dedicated.quantize.packed_binary_output = 1; }
     void EnableQuantizeScaleShiftOpt() { key.restrict.val.dedicated.quantize.scale_shift_opt = 1; }
@@ -375,6 +379,7 @@ private:
 struct EngineInfo {
     bool bSubGroupSupport = false;
     bool bSubGroupShortSupport = false;
+    bool bSubGroupCharSupport = false;
     bool bFP16Support = false;
     bool bFP64Support = false;
     bool bImageSupport = false;
@@ -468,6 +473,9 @@ struct FusedOpsConfiguration {
     IndexType index_type;
     // Defines outer loops channels where fused op is called.
     std::vector<Tensor::DataChannelName> loop_axes;
+    // If allow_for_partial_preload is false, then it's required that all fused_ops can be preloaded.
+    // If allow_for_partial_preload is true, then not preloaded fused_ops will be loaded in FUSED_OPS_CALC.
+    bool allow_for_partial_preload;
 
     FusedOpsConfiguration(std::string suffix,
                           std::vector<std::string> bfzyx_idx_order,
@@ -478,7 +486,8 @@ struct FusedOpsConfiguration {
                           BoundaryCheck boundary_check = BoundaryCheck::ENABLED,
                           IndexType index_type = IndexType::TENSOR_COORD,
                           Tensor::DataChannelName vec_axis = Tensor::DataChannelName::COUNT,
-                          std::vector<Tensor::DataChannelName> loop_axes = {})
+                          std::vector<Tensor::DataChannelName> loop_axes = {},
+                          bool allow_for_partial_preload = false)
       : suffix(suffix)
       , bfzyx_idx_order(bfzyx_idx_order)
       , input_var_name(input_var_name)
@@ -488,14 +497,18 @@ struct FusedOpsConfiguration {
       , load_type(load_type)
       , boundary_check(boundary_check)
       , index_type(index_type)
-      , loop_axes(loop_axes) { }
+      , loop_axes(loop_axes)
+      , allow_for_partial_preload(allow_for_partial_preload) { }
 
     FusedOpsConfiguration& SetVectorSize(size_t val) { vec_size = val; return *this; }
     FusedOpsConfiguration& SetLoadType(LoadType val) { load_type = val; return *this; }
     FusedOpsConfiguration& SetBoundaryCheck(BoundaryCheck val) { boundary_check = val; return *this; }
     FusedOpsConfiguration& SetIndexType(IndexType val) { index_type = val; return *this; }
     FusedOpsConfiguration& SetVectorAxis(Tensor::DataChannelName val) { vec_axis = val; return *this; }
-    FusedOpsConfiguration& SetLoopAxes(std::vector<Tensor::DataChannelName> val) { loop_axes = std::move(val); return *this; }
+    FusedOpsConfiguration& SetLoopAxes(std::vector<Tensor::DataChannelName> val, bool partial_preload = false) {
+        loop_axes = std::move(val);
+        allow_for_partial_preload = partial_preload;
+        return *this; }
 };
 
 // Instance of fused_operation_desc is added to fused_ops vector if a node has been fused to current one using program_impl::fuse_nodes
index d2f8624..f6baa68 100644 (file)
 
 namespace cldnn {
 
-void err_details::cldnn_print_error_message(const std::string& file,
-                                            int line,
+void err_details::cldnn_print_error_message(
+#ifndef NDEBUG
+                                            const std::string& file, int line,
+#else
+                                            const std::string&, int,
+#endif
                                             const std::string& instance_id,
                                             std::stringstream& msg,
                                             const std::string& add_msg) {
@@ -31,9 +35,6 @@ void err_details::cldnn_print_error_message(const std::string& file,
 
 #ifndef NDEBUG
         source_of_error << file << " at line: " << line << std::endl;
-#else
-        (void)file;
-        (void)line;
 #endif
         source_of_error << "Error has occured for: " << instance_id << std::endl;
 
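
The refactor above moves the NDEBUG check into the parameter list: in release builds the file and line parameters are simply left unnamed, so the old (void)file / (void)line casts that silenced unused-parameter warnings are no longer needed. A minimal standalone sketch of the same pattern, with illustrative names:

    #include <iostream>
    #include <string>

    void report(
    #ifndef NDEBUG
        const std::string& file, int line,
    #else
        const std::string&, int,   // unnamed in release builds, so never "unused"
    #endif
        const std::string& msg) {
    #ifndef NDEBUG
        std::cerr << file << ":" << line << ": ";
    #endif
        std::cerr << msg << std::endl;
    }
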
index 9d0d10f..a000e9a 100644 (file)
@@ -301,12 +301,15 @@ fused_conv_eltwise_inst::typed_primitive_inst(network_impl& network, fused_conv_
                                   "expected size of batch",
                                   1,
                                   "Biases isn't 1D vector.");
-            CLDNN_ERROR_NOT_EQUAL(node.id(),
-                                  "Bias feature[0]",
-                                  bias_inst.size.feature[0],
-                                  "expected feature map number",
-                                  output_size.feature[0] / split,
-                                  "Bias/fm mismatch");
+
+            if (node.get_output_layout().format != format::image_2d_rgba) {
+                CLDNN_ERROR_NOT_EQUAL(node.id(),
+                                      "Bias feature[0]",
+                                      bias_inst.size.feature[0],
+                                      "expected feature map number",
+                                      output_size.feature[0] / split,
+                                      "Bias/fm mismatch");
+            }
             CLDNN_ERROR_NOT_EQUAL(node.id(),
                                   "Bias spatial[1]",
                                   bias_inst.size.spatial[1],
index 8e010f6..dedf1b3 100644 (file)
@@ -147,6 +147,8 @@ attach_concatenation_gpu::attach_concatenation_gpu() {
         // block f16 format
         {std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), concatenation_gpu::create},
+        {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), concatenation_gpu::create},
+        {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), concatenation_gpu::create},
         // MMAD
         {std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), concatenation_gpu::create},
         {std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), concatenation_gpu::create},
index 400b88a..0ea4398 100644 (file)
@@ -196,12 +196,17 @@ attach_convolution_gpu::attach_convolution_gpu() {
     // block f16 format
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
+    // block i8 format
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
     // MMAD
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), val_fw);
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw);
+    implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw);
 
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv32), val_fw);
     implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv32), val_fw);
index 895f826..8a71082 100644 (file)
@@ -246,6 +246,8 @@ attach_eltwise_gpu::attach_eltwise_gpu() {
          // block f16
          { std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), eltwise_gpu::create },
+         { std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), eltwise_gpu::create },
+         { std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), eltwise_gpu::create },
          // 3D
          { std::make_tuple(engine_types::ocl, data_types::f32, format::bfzyx), eltwise_gpu::create },
          { std::make_tuple(engine_types::ocl, data_types::f16, format::bfzyx), eltwise_gpu::create },
index c008e62..89d592c 100644 (file)
@@ -108,6 +108,8 @@ attach_fully_connected_gpu::attach_fully_connected_gpu() {
         {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), val_fw},
         {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), val_fw},
         {std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv4), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw},
+        {std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw},
         // fs_b_yx_fsv32
         {std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw},
     });
index 94530d3..66bb526 100644 (file)
@@ -83,7 +83,8 @@ public:
 
         const auto transposed = arg.get_transposed();
 
-        assert(arg.get_output_layout().size.feature[0] == weights_layout.size.batch[0] * weights_layout.size.group[0]);
+        if (arg.get_fused_primitives().empty() || !(arg.get_fused_primitives().begin()->node->is_type<depth_to_space>()))
+            assert(arg.get_output_layout().size.feature[0] == weights_layout.size.batch[0] * weights_layout.size.group[0]);
 
         // conv params
         auto fused_params =
@@ -113,6 +114,7 @@ public:
 
         fused_params.non_conv_scale = primitive->non_conv_scale;
         fused_params.second_input_in_output = primitive->second_input_in_output;
+        fused_params.depth_to_space_already_fused = primitive->depth_to_space_already_fused;
 
         conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
         conv_params.split = split;
@@ -237,6 +239,8 @@ attach_fused_conv_eltwise_gpu::attach_fused_conv_eltwise_gpu() {
                                                 fused_conv_eltwise_gpu::create);
     implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf_af32),
                                                 fused_conv_eltwise_gpu::create);
+    implementation_map<fused_conv_eltwise>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::image_2d_rgba),
+        fused_conv_eltwise_gpu::create);
 }
 
 }  // namespace detail
index ec152db..a09538a 100644 (file)
@@ -104,6 +104,15 @@ gpu_image2d::gpu_image2d(const refcounted_obj_ptr<engine_impl>& engine, const la
             _height = layout.size.spatial[0] * layout.size.feature[0] * layout.size.spatial[1];
             order = CL_RGBA;
             break;
+        case format::image_2d_rgba:
+            _width = layout.size.spatial[0];
+            _height = layout.size.spatial[1];
+            order = CL_RGBA;
+            if (layout.size.feature[0] != 3 && layout.size.feature[0] != 4) {
+                CLDNN_ERROR_MESSAGE("2D image allocation", "invalid number of channels in image_2d_rgba input image (should be 3 or 4)!");
+            }
+            type = CL_UNORM_INT8;
+            break;
         case format::nv12:
             _width = layout.size.spatial[1];
             _height = layout.size.spatial[0];
@@ -189,7 +198,7 @@ gpu_media_buffer::gpu_media_buffer(const refcounted_obj_ptr<engine_impl>& engine
     const shared_mem_params* params,
     uint32_t net_id)
     : gpu_image2d(engine, new_layout,
-        cl::ImageVA(engine->get_context()->context(), CL_MEM_READ_ONLY,
+        cl::ImageVA(engine->get_context()->context(), CL_MEM_READ_WRITE,
                     params->surface, params->plane),
         net_id),
     device(params->user_device),
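
Together with the allocation branch added above, image_2d_rgba tensors are backed by a single OpenCL 2D image whose pixels hold four normalized 8-bit channels (CL_RGBA / CL_UNORM_INT8), sized from the spatial dims, with 3-channel inputs still allocated as RGBA. A rough host-side sketch of the equivalent raw OpenCL call, assuming a bare OpenCL 1.2 context and omitting the engine wrappers and error handling used in the real code:

    #include <CL/cl.h>

    cl_mem make_rgba_image(cl_context ctx, size_t width, size_t height, cl_int* err) {
        cl_image_format fmt{};
        fmt.image_channel_order = CL_RGBA;          // 3- and 4-channel inputs both map here
        fmt.image_channel_data_type = CL_UNORM_INT8;
        cl_image_desc desc{};
        desc.image_type = CL_MEM_OBJECT_IMAGE2D;
        desc.image_width = width;
        desc.image_height = height;
        return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, /*host_ptr=*/nullptr, err);
    }
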
index c6baa9f..4661045 100644 (file)
@@ -92,6 +92,14 @@ attach_mvn_gpu::attach_mvn_gpu() {
                                  mvn_gpu::create);
     implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16),
                                  mvn_gpu::create);
+    implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16),
+                                 mvn_gpu::create);
+    implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16),
+                                 mvn_gpu::create);
+    implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16),
+                                 mvn_gpu::create);
+    implementation_map<mvn>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16),
+                                 mvn_gpu::create);
 }
 
 }  // namespace detail
index 0ca759a..d2e0802 100644 (file)
@@ -178,9 +178,10 @@ attach_pooling_gpu::attach_pooling_gpu() {
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), pooling_gpu::create);
-    // block fsv16 format
+    // block fp16 format
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), pooling_gpu::create);
+    // block i8 format
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), pooling_gpu::create);
     // 3D
index ca5bf44..ae38bdd 100644 (file)
@@ -89,6 +89,8 @@ attach_quantize_gpu::attach_quantize_gpu() {
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
 
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf_af32), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf_af32), val_fw);
index a7f61cf..ff39eba 100644 (file)
@@ -100,6 +100,9 @@ attach_scale_gpu::attach_scale_gpu() {
 
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv16), val_fw);
+    implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
+
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_zyx_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv16), val_fw);
     implementation_map<scale>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
index 6cc6040..aca26f1 100644 (file)
@@ -58,7 +58,8 @@ public:
         params.shrink_axis_mask = arg.get_primitive()->shrink_axis_mask;
         pad_vector_to_size(params.shrink_axis_mask, dims_num, 0);
 
-        std::vector<size_t> logical_dims = params.output.LogicalDims();
+        std::vector<size_t> logical_dims = params.inputs[0].LogicalDims();
+        std::reverse(logical_dims.begin(), logical_dims.end());  // get dims in bfyx order
         std::vector<int32_t> out_shape;
         for (const auto& dim : logical_dims)
             out_shape.push_back(static_cast<int32_t>(dim));
@@ -68,6 +69,16 @@ public:
         // instead.
         vector_assign_if_not_mask(params.striding_params[1], out_shape, params.end_mask);
 
+        for (size_t dim = 0; dim < params.striding_params[2].size(); dim++) {
+            auto begin = params.striding_params[0][dim] < 0 ? out_shape[dim] + params.striding_params[0][dim] : params.striding_params[0][dim];
+            auto end = params.striding_params[1][dim] < 0 ? out_shape[dim] + params.striding_params[1][dim] : params.striding_params[1][dim];
+            auto stride = params.striding_params[2][dim];
+            if (stride < 0 && (end > begin)) {
+                std::swap(params.striding_params[0][dim], params.striding_params[1][dim]);
+                params.striding_params[0][dim] = params.striding_params[0][dim] - 1;
+            }
+        }
+
         auto& kernel_selector = kernel_selector::strided_slice_kernel_selector::Instance();
         auto best_kernels = kernel_selector.GetBestKernels(params, op_params);
 
index 2a6c099..55e6b92 100644 (file)
 #include "lstm_inst.h"
 #include "reshape_inst.h"
 #include "resample_inst.h"
+#include "permute_inst.h"
+#include "depth_to_space_inst.h"
 #include "lstm_dynamic_inst.h"
 #include "lstm_dynamic_input_inst.h"
 #include "lstm_dynamic_timeloop_inst.h"
 #include "mutable_data_inst.h"
 #include "arg_max_min_inst.h"
+#include "kernel_selector_utils.h"
 
 #include <iomanip>
 #include <string>
@@ -114,6 +117,65 @@ void graph_initializations::replace_nodes(program_impl& p) {
             p.nodes_map.erase(node->id());
             continue;
         }
+
+        // find the reshape->permute->reshape sequence and replace it with depth_to_space

+        if (node->is_type<reshape>()) {
+            if (!p.get_options().get<build_option_type::optimize_data>()->enabled())
+                continue;
+
+            if (node->get_users().size() == 0)
+                continue;
+
+            auto& input_node = node->get_dependency(0);
+            if (!(node->get_users().front()->is_type<permute>()) || !(input_node.is_type<reorder>()))
+                continue;
+
+            auto input_node_layout = input_node.get_output_layout();
+            if (input_node_layout.format != format::bfwzyx || input_node_layout.data_type != data_types::f16)
+                continue;
+
+            // optimal implementation only for depth to space block size 2
+            auto reshape1_layout = node->get_output_layout();
+            if (reshape1_layout.size.spatial[3] != 2)
+                continue;
+
+            auto permute_prim = node->get_users().front()->as<permute>().typed_desc();
+            primitive_id permute_id = node->get_users().front()->id();
+            auto& permute_node = node->get_users().front();
+
+            auto reshape1_prim = node->as<reshape>().typed_desc();
+            primitive_id reshape1_id = node->id();
+
+            p.remove_connection(*node, *permute_node);
+
+            auto perm_node_ptr = p.nodes_map.find(permute_id)->second;
+            auto perm_node = &perm_node_ptr->as<permute>();
+
+            auto rename_id = permute_id + "_tmp";
+            p.rename(*perm_node, rename_id);
+
+            auto reorder_id = input_node.id() + "_reorder_for_depth_to_space";
+            auto reorder_prim = std::make_shared<reorder>(reorder_id, input_node.id(), format::bfyx, input_node_layout.data_type);
+            auto pixel_shuffle_prim = std::make_shared<depth_to_space>(permute_id, reorder_id, 2);
+
+            p.get_or_create(reorder_prim);
+            p.get_or_create(pixel_shuffle_prim);
+            auto reorder_depth_node_ptr = p.nodes_map.find(reorder_id)->second;
+            auto pixel_shuffle_node_ptr = p.nodes_map.find(permute_id)->second;
+            p.add_connection(input_node, *reorder_depth_node_ptr);
+            p.add_connection(*reorder_depth_node_ptr, *pixel_shuffle_node_ptr);
+
+            auto deconv_node_ptr = p.nodes_map.find(rename_id)->second;
+            p.replace_all_usages(*deconv_node_ptr, *pixel_shuffle_node_ptr);
+            p.optimized_out.push_back(rename_id);
+            p.nodes_map.erase(rename_id);
+
+            p.remove_connection(input_node, *node);
+            p.replace_all_usages(*node, input_node);
+            p.optimized_out.push_back(reshape1_id);
+            p.nodes_map.erase(reshape1_id);
+            continue;
+        }
     }
 }
 
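
The new graph_initializations branch looks for the reshape/permute pattern around a bfwzyx fp16 reorder described in its comment and, when optimize_data is enabled and the implied block size is 2, rewires it into a bfyx reorder followed by a depth_to_space primitive. For reference, a plain sketch of what block-size-2 depth_to_space computes on a dense buffer; the channel ordering used here (c * 4 + dy * 2 + dx) is an assumption for illustration, not necessarily clDNN's exact convention:

    #include <vector>

    // in  : [C_out * 4, H, W] flattened, out : [C_out, 2 * H, 2 * W] flattened.
    void depth_to_space_bs2(const std::vector<float>& in, std::vector<float>& out,
                            int C_out, int H, int W) {
        for (int c = 0; c < C_out; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                    for (int dy = 0; dy < 2; ++dy)
                        for (int dx = 0; dx < 2; ++dx)
                            out[(c * 2 * H + (2 * y + dy)) * 2 * W + (2 * x + dx)] =
                                in[((c * 4 + dy * 2 + dx) * H + y) * W + x];
    }
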
index 1e76538..59ca913 100644 (file)
@@ -21,6 +21,8 @@
 
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
+#include "depth_to_space_inst.h"
+#include "kernel_selector_utils.h"
 #include <vector>
 #include <list>
 #include <memory>
@@ -40,132 +42,291 @@ void pre_replace_deconv::run(program_impl& p) {
                 continue;
 
             auto deconv_prim = node->as<deconvolution>().typed_desc();
-
-            // limit optimization to stride = 1
-            if (deconv_prim->stride.spatial[0] != 1 || deconv_prim->stride.spatial[1] != 1 || deconv_prim->gradient())
-                continue;
-
-            primitive_id deconv_id = node->id();
-            auto& input_node = node->get_dependency(0);
-
-            // disable for 5D
-            if (cldnn::format::dimension(input_node.get_output_layout().format) == 5)
-                continue;
-
-            // Disable for blocked formats
-            if ((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
-                _lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16)) {
-                continue;
-            }
-
-
-            primitive_id input_id = deconv_prim->input[0];
-
-            // setting convolution parameters based on deconvolution params
-            auto stride = deconv_prim->stride;
+            tensor filter_size = { 1, 1, 1, 1, 1 };
             auto weights = deconv_prim->weights;
+
             std::vector<primitive_id> weights_vec;
-            for (auto& weights_id : weights) weights_vec.push_back(weights_id);
-            auto biases = deconv_prim->bias;
-            std::vector<primitive_id> bias_vec;
-            for (auto& bias_id : biases) bias_vec.push_back(bias_id);
-            auto input_offset = deconv_prim->input_offset;
-            auto output_padding = deconv_prim->output_padding;
-
-            // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
-            // list
-            tensor filter_size = {1, 1, 1, 1, 1};
-            p.remove_connection(node->get_dependency(0), *node);
+            for (auto& weights_id : weights)
+                weights_vec.push_back(weights_id);
+
             for (auto& weights_id : weights_vec) {
                 auto weights_iter = p.nodes_map.find(weights_id);
                 if (weights_iter == p.nodes_map.end())  continue;
 
                 auto weights_node_ptr = weights_iter->second;
-                p.remove_connection(*weights_node_ptr, *node);
-                // get filter spatial sizes for input offset adjustment, perform this only once as all filters shouls
+                // get filter spatial sizes for input offset adjustment, perform this only once as all filters should
                 // have same size
                 if (weights_id == weights_vec[0])
                     filter_size = weights_node_ptr->get_output_layout().size;
             }
 
-            input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
-            input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
-            input_offset.spatial[2] = std::abs(input_offset.spatial[2]) - (filter_size.spatial[2] - 1);
+            // limit optimization to stride = 1
+            if (deconv_prim->stride.spatial[0] == 1 && deconv_prim->stride.spatial[1] == 1 && !deconv_prim->gradient()) {
+                primitive_id deconv_id = node->id();
+                auto& input_node = node->get_dependency(0);
 
-            if (!bias_vec.empty()) {
-                for (auto& bias_id : bias_vec) {
-                    auto bias_iter = p.nodes_map.find(bias_id);
-                    if (bias_iter == p.nodes_map.end())  continue;
+                // disable for 5D
+                if (cldnn::format::dimension(input_node.get_output_layout().format) == 5)
+                    continue;
 
-                    auto bias_id_node_ptr = bias_iter->second;
-                    p.remove_connection(*bias_id_node_ptr, *node);
+                // Disable for blocked formats
+                if ((_lo.get_optimization_attributes().b_fs_yx_fsv16_network || input_node.get_output_layout().format == format::b_fs_yx_fsv16) &&
+                    _lo.is_format_optimized(node->as<deconvolution>(), format::b_fs_yx_fsv16)) {
+                    continue;
                 }
-            }
-            auto rename_id = deconv_id + "_tmp";
-            p.rename(*node, rename_id);
-
-            // create convolution primitive
-            if (biases.size() != 0) {
-                auto conv_prim = std::make_shared<convolution>(deconv_id,
-                                                               input_id,
-                                                               weights_vec,
-                                                               bias_vec,
-                                                               stride,
-                                                               input_offset,
-                                                               tensor{1, 1, 1, 1},
-                                                               output_padding);
-                p.get_or_create(conv_prim);
-            } else {
-                auto conv_prim = std::make_shared<convolution>(deconv_id,
-                                                               input_id,
-                                                               weights_vec,
-                                                               stride,
-                                                               input_offset,
-                                                               tensor{1, 1, 1, 1},
-                                                               output_padding);
-                p.get_or_create(conv_prim);
-            }
 
-            auto conv_node_itr = p.nodes_map.find(deconv_id);
-            if (conv_node_itr == p.nodes_map.end()) continue;
+                primitive_id input_id = deconv_prim->input[0];
 
-            auto conv_node_ptr = conv_node_itr->second;
-            auto conv_node = &conv_node_ptr->as<convolution>();
-            conv_node->set_transposed(true);
+                // setting convolution parameters based on deconvolution params
+                auto stride = deconv_prim->stride;
+                auto biases = deconv_prim->bias;
+                std::vector<primitive_id> bias_vec;
+                for (auto& bias_id : biases) bias_vec.push_back(bias_id);
+                auto input_offset = deconv_prim->input_offset;
+                auto output_padding = deconv_prim->output_padding;
 
-            // add connections input->convolution, weights->convolution and bias->convolution
-            p.add_connection(input_node, *conv_node_ptr);
+                // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
+                // list
+                p.remove_connection(node->get_dependency(0), *node);
+                for (auto& weights_id : weights_vec) {
+                    auto weights_iter = p.nodes_map.find(weights_id);
+                    if (weights_iter == p.nodes_map.end())  continue;
 
-            for (auto& weights_id : weights_vec) {
-                auto weights_node_itr = p.nodes_map.find(weights_id);
-                if (weights_node_itr == p.nodes_map.end()) continue;
+                    auto weights_node_ptr = weights_iter->second;
+                    p.remove_connection(*weights_node_ptr, *node);
+                }
 
-                auto weights_node_ptr = weights_node_itr->second;
-                p.add_connection(*weights_node_ptr, *conv_node_ptr);
-            }
+                input_offset.spatial[0] = std::abs(input_offset.spatial[0]) - (filter_size.spatial[0] - 1);
+                input_offset.spatial[1] = std::abs(input_offset.spatial[1]) - (filter_size.spatial[1] - 1);
+                input_offset.spatial[2] = std::abs(input_offset.spatial[2]) - (filter_size.spatial[2] - 1);
 
-            if (!bias_vec.empty()) {
-                for (auto& bias_id : bias_vec) {
-                    auto bias_id_node_itr = p.nodes_map.find(bias_id);
-                    if (bias_id_node_itr == p.nodes_map.end()) continue;
+                if (!bias_vec.empty()) {
+                    for (auto& bias_id : bias_vec) {
+                        auto bias_iter = p.nodes_map.find(bias_id);
+                        if (bias_iter == p.nodes_map.end())  continue;
 
-                    auto bias_id_node_ptr = bias_id_node_itr->second;
-                    p.add_connection(*bias_id_node_ptr, *conv_node_ptr);
+                        auto bias_id_node_ptr = bias_iter->second;
+                        p.remove_connection(*bias_id_node_ptr, *node);
+                    }
                 }
-            }
+                auto rename_id = deconv_id + "_tmp";
+                p.rename(*node, rename_id);
 
-            auto deconv_node_itr = p.nodes_map.find(rename_id);
-            if (deconv_node_itr != p.nodes_map.end()) {
-                auto deconv_node_ptr = deconv_node_itr->second;
-                p.replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
-                p.optimized_out.push_back(rename_id);
-                p.nodes_map.erase(rename_id);
-            }
+                // create convolution primitive
+                if (biases.size() != 0) {
+                    auto conv_prim = std::make_shared<convolution>(deconv_id,
+                        input_id,
+                        weights_vec,
+                        bias_vec,
+                        stride,
+                        input_offset,
+                        tensor{ 1, 1, 1, 1 },
+                        output_padding);
+                    p.get_or_create(conv_prim);
+                } else {
+                    auto conv_prim = std::make_shared<convolution>(deconv_id,
+                        input_id,
+                        weights_vec,
+                        stride,
+                        input_offset,
+                        tensor{ 1, 1, 1, 1 },
+                        output_padding);
+                    p.get_or_create(conv_prim);
+                }
+
+                auto conv_node_itr = p.nodes_map.find(deconv_id);
+                if (conv_node_itr == p.nodes_map.end()) continue;
+
+                auto conv_node_ptr = conv_node_itr->second;
+                auto conv_node = &conv_node_ptr->as<convolution>();
+                conv_node->set_transposed(true);
+
+                // add connections input->convolution, weights->convolution and bias->convolution
+                p.add_connection(input_node, *conv_node_ptr);
+
+                for (auto& weights_id : weights_vec) {
+                    auto weights_node_itr = p.nodes_map.find(weights_id);
+                    if (weights_node_itr == p.nodes_map.end()) continue;
+
+                    auto weights_node_ptr = weights_node_itr->second;
+                    p.add_connection(*weights_node_ptr, *conv_node_ptr);
+                }
+
+                if (!bias_vec.empty()) {
+                    for (auto& bias_id : bias_vec) {
+                        auto bias_id_node_itr = p.nodes_map.find(bias_id);
+                        if (bias_id_node_itr == p.nodes_map.end()) continue;
+
+                        auto bias_id_node_ptr = bias_id_node_itr->second;
+                        p.add_connection(*bias_id_node_ptr, *conv_node_ptr);
+                    }
+                }
+
+                auto deconv_node_itr = p.nodes_map.find(rename_id);
+                if (deconv_node_itr != p.nodes_map.end()) {
+                    auto deconv_node_ptr = deconv_node_itr->second;
+                    p.replace_all_usages(*deconv_node_ptr, *conv_node_ptr);
+                    p.optimized_out.push_back(rename_id);
+                    p.nodes_map.erase(rename_id);
+                }
 
-            p.mark_if_data_flow(*conv_node);
-            conv_node->recalc_output_layout(true);
+                update_processing_order = true;
 
-            update_processing_order = true;
+
+                p.mark_if_data_flow(*conv_node);
+                conv_node->recalc_output_layout(true);
+
+                update_processing_order = true;
+            // current optimization only available for specific deconvolution parameters
+            } else if (node->is_output() == false &&
+               node->get_output_layout().size.feature[0] == 1 &&
+               deconv_prim->stride.spatial[0] == 2 && deconv_prim->stride.spatial[1] == 2 &&
+               filter_size.spatial[0] == 9 && filter_size.spatial[1] == 9 &&
+               deconv_prim->input_offset.spatial[0] == -4 && deconv_prim->input_offset.spatial[1] == -4 &&
+               weights_vec.size() == 1 && deconv_prim->bias.size() == 1 &&
+               node->get_dependency(0).get_output_layout().format == format::bfyx &&
+               !deconv_prim->gradient()) {
+                primitive_id deconv_id = node->id();
+                auto& input_node = node->get_dependency(0);
+                primitive_id input_id = deconv_prim->input[0];
+
+                auto scale_factor = deconv_prim->stride.spatial[0];
+
+                auto cur_weights_node_ptr = p.nodes_map.find(weights_vec[0])->second;
+                auto weights_layout = cur_weights_node_ptr->get_output_layout();
+                auto weights_data_type = weights_layout.data_type;
+
+                auto biases = deconv_prim->bias[0];
+                auto bias_id_node_ptr = p.nodes_map.find(biases)->second;
+                auto bias_data_type = bias_id_node_ptr->get_output_layout().data_type;
+
+                // enable only for fp32 and fp16
+                if (weights_data_type != data_types::f16 &&
+                    weights_data_type != data_types::f32 &&
+                    bias_data_type != data_types::f16 &&
+                    bias_data_type != data_types::f32)
+                    continue;
+
+                // setting convolution parameters based on deconvolution params
+                tensor stride = { 1, 1, 1, 1 };
+                tensor input_offset = { 0, 0, -scale_factor, -scale_factor };
+                auto output_padding = deconv_prim->output_padding;
+
+                // remove deconvolution node and its connections to weights and biases,
+                // rename it and move to the optimized list
+                p.remove_connection(node->get_dependency(0), *node);
+
+                auto weights_node_ptr = p.nodes_map.find(weights_vec[0])->second;
+                p.remove_connection(*weights_node_ptr, *node);
+                p.remove_connection(*bias_id_node_ptr, *node);
+
+                auto rename_id = deconv_id + "_tmp";
+                p.rename(*node, rename_id);
+
+                // reshape weights
+                int pixel_shuffle_size = scale_factor * scale_factor;
+                int kernel_size = 5;
+                tensor target_weights_size = { pixel_shuffle_size, filter_size.feature[0], kernel_size, kernel_size };
+                auto target_weights_layout = layout{ weights_layout.data_type, weights_layout.format, target_weights_size };
+
+                {
+                     memory_impl::ptr data_to_allocate = p.get_engine().allocate_memory(target_weights_layout, 0);
+
+                     std::vector<float> weights_vec_float;
+
+                     if (weights_data_type == data_types::f16) {
+                         mem_lock<half_t> src{ cur_weights_node_ptr->as<data>().get_attached_memory() };
+                         for (uint32_t i = 0; i < weights_layout.size.count(); i++)
+                             weights_vec_float.push_back(static_cast<float>(src.data()[i]));
+                     } else {
+                         mem_lock<float> src{ cur_weights_node_ptr->as<data>().get_attached_memory() };
+                         for (uint32_t i = 0; i < weights_layout.size.count(); i++)
+                             weights_vec_float.push_back(src.data()[i]);
+                     }
+
+                     std::vector<std::vector<std::vector<float> > > subpixel_weights(pixel_shuffle_size);
+
+                     program_helpers::reshape_deconvolution_weights(weights_vec_float,
+                         static_cast<int>(filter_size.feature[0]),
+                         static_cast<int>(filter_size.spatial[0]),
+                         static_cast<int>(filter_size.spatial[1]),
+                         scale_factor,
+                         subpixel_weights);
+
+                     if (weights_data_type == data_types::f16) {
+                         mem_lock<half_t> dst{ data_to_allocate };
+                         program_helpers::set_weights_values<half_t>(dst.data(), subpixel_weights);
+                     } else if (weights_data_type == data_types::f32) {
+                         mem_lock<float> dst{ data_to_allocate };
+                         program_helpers::set_weights_values<float>(dst.data(), subpixel_weights);
+                     } else {
+                         throw std::logic_error("Not supported data type.");
+                     }
+
+                     memory api_memory = memory(data_to_allocate.detach());
+                     auto data_node_weights_replace = std::make_shared<data>(weights_vec[0] + "_conv_rpl", api_memory);
+                     p.get_or_create(data_node_weights_replace);
+                     auto data_node_weights_replace_node_ptr = p.nodes_map.find(weights_vec[0] + "_conv_rpl")->second;
+                     auto& data_node = data_node_weights_replace_node_ptr->as<data>();
+                     data_node.set_output_layout(target_weights_layout, false);
+                }
+                float bias = 0;
+
+                if (bias_data_type == data_types::f16) {
+                    mem_lock<half_t> src{ bias_id_node_ptr->as<data>().get_attached_memory() };
+                    bias = static_cast<float>(src.data()[0]);
+                } else {
+                    mem_lock<float> src{ bias_id_node_ptr->as<data>().get_attached_memory() };
+                    bias = src.data()[0];
+                }
+
+                auto deconv_id_conv = deconv_id + "_conv";
+
+                // create convolution primitive
+                auto conv_prim = std::make_shared<convolution>(deconv_id_conv,
+                    input_id,
+                    std::vector<primitive_id>{ weights_vec[0] + "_conv_rpl" },
+                    stride,
+                    input_offset,
+                    tensor{ 1, 1, 1, 1 },
+                    output_padding);
+                p.get_or_create(conv_prim);
+
+                auto conv_node_itr = p.nodes_map.find(deconv_id_conv);
+                if (conv_node_itr == p.nodes_map.end()) continue;
+
+                auto conv_node_ptr = conv_node_itr->second;
+                auto conv_node = &conv_node_ptr->as<convolution>();
+
+                // add connections input->convolution, weights->convolution and bias->convolution
+                p.add_connection(input_node, *conv_node_ptr);
+
+                {
+                    auto weights_node_conv_rpl_ptr = p.nodes_map.find(weights_vec[0] + "_conv_rpl")->second;
+                    p.add_connection(*weights_node_conv_rpl_ptr, *conv_node_ptr);
+                    p.inputs.push_back(weights_node_conv_rpl_ptr.get());
+                }
+
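+                // depth_to_space (block size 2) performs the pixel shuffle that restores the spatial upscaling of the original deconvolution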
+                auto pixel_shuffle_prim = std::make_shared<depth_to_space>(deconv_id, deconv_id_conv, 2);
+
+                p.get_or_create(pixel_shuffle_prim);
+                auto pixel_shuffle_node_ptr = p.nodes_map.find(deconv_id)->second;
+                pixel_shuffle_node_ptr->add_fused_activation(activation_func::linear, { 1, bias });
+
+                // add connection convolution->depth_to_space
+                p.add_connection(*conv_node_ptr, *pixel_shuffle_node_ptr);
+
+                auto deconv_node_ptr = p.nodes_map.find(rename_id);
+                if (deconv_node_ptr != p.nodes_map.end()) {
+                    p.replace_all_usages(*deconv_node_ptr->second, *pixel_shuffle_node_ptr);
+                    p.optimized_out.push_back(rename_id);
+                    p.nodes_map.erase(rename_id);
+                }
+                p.mark_if_data_flow(*conv_node);
+                conv_node->recalc_output_layout(true);
+
+                update_processing_order = true;
+            }
         }
     }
 
index 397d65c..6425e7e 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2018-2019 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include "api/eltwise.hpp"
 #include "api/pooling.hpp"
+#include "fused_conv_eltwise_inst.h"
 #include "primitive_inst.h"
 #include "activation_inst.h"
 #include "concatenation_inst.h"
@@ -25,6 +26,7 @@
 #include "eltwise_inst.h"
 #include "reshape_inst.h"
 #include "scale_inst.h"
+#include "depth_to_space_inst.h"
 
 #include "pass_manager.h"
 #include "program_helpers.h"
@@ -161,6 +163,35 @@ void prepare_buffer_fusing::run(program_impl& p) {
                     lower_padd_in_axis += input->get_output_layout().size.raw[concat_axis];
                 }
 
+                // Check whether doing the concat in place is worth it: if the following primitive is a convolution
+                // whose input padding differs from the paddings of the convolutions consuming the concatenation's inputs,
+                // that convolution will likely fall back to a reference implementation due to the mismatched padding,
+                // and the gain from the in-place concat is nullified by the slower convolution implementation.
+                // This should be handled by a more advanced tuning mechanism at the topology level.
+                auto& users = node.get_users();
+                if (users.size() == 1) {
+                    auto& user = users.front();
+                    if (node.get_output_layout().format == format::bfyx && user->type() == convolution::type_id()) {
+                        auto out_input_offsets = user->as<convolution>().get_primitive()->input_offset;
+
+                        std::vector<tensor> in_input_offsets;
+                        for (auto& in_user : nodes_list.first) {
+                            if (in_user->type() == convolution::type_id())
+                                in_input_offsets.push_back(in_user->as<convolution>().get_primitive()->input_offset);
+                        }
+
+                        for (auto& in_input_offset : in_input_offsets) {
+                            if (in_input_offset.spatial[0] != out_input_offsets.spatial[0] &&
+                                in_input_offset.spatial[1] != out_input_offsets.spatial[1])
+                                return;
+                        }
+                    } else if (user->type() == fused_conv_eltwise::type_id()) {
+                        if (!user->as<fused_conv_eltwise>().get_fused_primitives().empty() &&
+                            user->as<fused_conv_eltwise>().get_fused_primitives().begin()->node->is_type<depth_to_space>())
+                            return;
+                    }
+                }
+
                 // apply concatenation in place optimization
                 for (auto input : nodes_list.first) {
                     auto input_lenght = input->get_output_layout().size.raw[concat_axis];
index b7351db..5bd635e 100644 (file)
@@ -515,12 +515,21 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             int p1_pnum = p.get_processing_order().get_processing_number(parents[fused_idx]);
             int p2_pnum = p.get_processing_order().get_processing_number(parents[peer_idx]);
 
-            if (p1_pnum < p2_pnum && can_fuse_parents[peer_idx]) {
+            auto p1_dt = parents[fused_idx]->get_output_layout().data_type;
+            auto p2_dt = parents[peer_idx]->get_output_layout().data_type;
+
+            if (can_fuse_parents[peer_idx] &&
+               ((p1_pnum < p2_pnum && p1_dt == p2_dt) || (data_type_traits::is_floating_point(p2_dt) && !data_type_traits::is_floating_point(p1_dt)))) {
+                // Swap in 2 cases:
+                // 1. Both branches have the same data type. Select the branch with the lower processing number.
+                // 2. The peer node has an fp32 output type while the fused node is int8. In that case fuse into the
+                //    branch with the fp32 output type to avoid fp32 blobs in the quantized graph.
                 std::swap(fused_idx, peer_idx);
             }
 
             auto fused_node = parents[fused_idx];
             auto peer_node = parents[peer_idx];
+
             if (parent1->is_type<convolution>() && !conv_supports_fusings(parent1->as<convolution>()))
                 return;
 
@@ -558,6 +567,33 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
         p.get_processing_order().calc_processing_order(p);
 }
 
+void prepare_conv_eltw_fusing::fuse_conv_depth_to_space(program_impl& p, program_node* node) {
+    // make sure this convolution has only 1 user and that user is depth_to_space
+    // make sure the convolution is not an output
+    if (node->get_users().size() != 1 || node->is_output())
+        return;
+
+    if (!node->get_users().front()->is_type<depth_to_space>())
+        return;
+
+    convolution_node* conv_node = static_cast<convolution_node*>(node);
+
+    depth_to_space_node* d_t_s_node = static_cast<depth_to_space_node*>(node->users.front());
+    if (d_t_s_node->get_users().empty())
+        return;
+    if (!d_t_s_node->get_users().front()->is_type<eltwise>())
+        return;
+
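+    // fuse only when every input of the depth_to_space node is f16 in bfyx format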
+    for (auto& dep : d_t_s_node->get_dependencies()) {
+        format fmt = dep->get_output_layout().format;
+        data_types dep_dt = dep->get_output_layout().data_type;
+        if ((fmt != format::bfyx || dep_dt != data_types::f16))
+            return;
+    }
+
+    p.fuse_nodes(*conv_node, *d_t_s_node);
+}
+
 void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node* node) {
     // make sure this convolution has only 1 user and that user is eltwise
     // make sure the convolution is not an output
@@ -570,6 +606,10 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
     convolution_node* conv_node = static_cast<convolution_node*>(node);
     convolution& conv = const_cast<convolution&>(*conv_node->get_primitive());
 
+    bool if_already_depth_to_space_fused = false;
+    if (!conv_node->get_fused_primitives().empty())
+        if_already_depth_to_space_fused = conv_node->get_fused_primitives().begin()->node->is_type<depth_to_space>();
+
     // TODO: find a better way to check for available kernels
     // currently works only for these formats
     data_types data_type = conv_node->get_output_layout().data_type;
@@ -583,7 +623,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
             (fmt != format::byxf_af32 || dep_dt != data_types::i8) &&
             (fmt != format::byxf_af32 || dep_dt != data_types::u8) &&
             (fmt != format::bfyx || dep_dt != data_types::f32) && (fmt != format::bfyx || dep_dt != data_types::u8) &&
-            (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16))
+            (fmt != format::bfyx || dep_dt != data_types::i8) && (fmt != format::yxfb || dep_dt != data_types::f16) &&
+            (fmt != format::bfyx || dep_dt != data_types::f16 || !if_already_depth_to_space_fused))
             return;
     }
 
@@ -597,7 +638,7 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
         if (filter_size.spatial[0] == 1 && filter_size.spatial[1] == 1) {
             if (conv.stride.spatial[0] != 1 || conv.stride.spatial[1] != 1)
                 return;
-        } else {
+        } else if (!if_already_depth_to_space_fused) {
             return;
         }
     }
@@ -614,7 +655,7 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
 
     // make sure eltwise have only 2 inputs
     // make sure eltwise is not an output
-    if (eltw_node->inputs_count() != 2 || eltw_node->is_output())
+    if (!if_already_depth_to_space_fused && (eltw_node->inputs_count() != 2 || eltw_node->is_output()))
         return;
 
     // only single ADD operation is currently supported
@@ -638,6 +679,13 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
     if (eltw_node->input(eltw_fused_input_idx).id() != conv.id)
         return;
 
+    auto fused_output_layout_size = eltw_node->input(eltw_second_input_idx).get_output_layout().size;
+    auto conv_output_layout_size = conv_node->get_output_layout().size;
+
+    if (fused_output_layout_size.spatial[0] * fused_output_layout_size.spatial[1] * fused_output_layout_size.feature[0] * fused_output_layout_size.batch[0]
+        != conv_output_layout_size.spatial[0] * conv_output_layout_size.spatial[1] * conv_output_layout_size.feature[0] * conv_output_layout_size.batch[0])
+        return;
+
     // get strides for other than our conv input
     std::vector<tensor> new_eltw_strides;
     // conv strides modified by eltwise stride
@@ -694,6 +742,8 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
     // Copy output data type from eltwise
     fused_conv_eltw->output_data_type = eltw_node->get_output_layout().data_type;
 
+    fused_conv_eltw->depth_to_space_already_fused = if_already_depth_to_space_fused;
+
     auto& new_node = p.get_or_create(fused_conv_eltw);
 
     for (size_t i = 0; i < eltw_node->get_fused_activations_funcs().size(); i++)
@@ -733,9 +783,30 @@ void prepare_conv_eltw_fusing::fuse_conv_eltwise(program_impl& p, program_node*
 
     new_node.dependencies = updated_deps;
 
+    if (if_already_depth_to_space_fused) {
+        new_node.add_fused_primitives(conv_node->get_fused_primitives());
+    }
+
     // Extract convolution node - will replace its usage in fused with input
     p.extract_and_remove(*conv_node);
-    new_node.recalc_output_layout();
+
+    // To change the convolution's output to an image type, make sure it is the last primitive in the topology,
+    // or that the only primitive after it is a reorder which is the network's output
+    auto reorder_user = (new_node.get_users().size() == 1);
+    if (reorder_user)
+        reorder_user &= ((new_node.get_users().front()->is_type<reorder>()) && (new_node.get_users().front()->is_output()));
+    if (if_already_depth_to_space_fused && (new_node.get_users().size() == 0 || reorder_user)) {
+        cldnn::layout new_layout = { data_types::u8, format::image_2d_rgba, fused_output_layout_size };
+        new_node.set_output_layout(new_layout);
+        // Remove output reorder if present
+        if (reorder_user) {
+            auto& reorder_node = new_node.get_users().front();
+            reorder_node->remove_dependency(1);
+            p.extract_and_remove(*reorder_node);
+        }
+    } else {
+        new_node.recalc_output_layout();
+    }
 
     p.add_optimized_primitive_info(conv_id, {new_node.id()});
     p.add_optimized_primitive_info(eltw_id, {new_node.id()});
@@ -763,6 +834,8 @@ void prepare_conv_eltw_fusing::run(program_impl& p) {
 
         auto& node = (*node_itr);
 
+        fuse_conv_depth_to_space(p, node);
+
         fuse_conv_eltwise(p, node);
     }
 }
index 4c62624..bb81b6d 100644 (file)
@@ -434,9 +434,6 @@ void prepare_quantization::prepare_asymmetric_quantization(program_impl &p) {
         auto node_itr = itr++;
         auto& node = (*node_itr);
 
-        if (node->is_output())
-            continue;
-
         // Detects if given eltwise node performs zero point subtraction
         auto is_zero_point_node = [](eltwise_node& node) -> bool {
             auto prim = node.get_primitive();
@@ -660,7 +657,7 @@ void prepare_quantization::prepare_asymmetric_quantization(program_impl &p) {
             // Remove sub operations from the graph and set correct users for zero points and inputs
             if (asymmetric_data) {
                 if (!new_a_zp || !new_input)
-                    CLDNN_ERROR_MESSAGE(convolution_node.id(), "Unexpected nullptr in asymmetric quantization for activations optimization");
+                    CLDNN_ERROR_MESSAGE(new_conv_node.id(), "Unexpected nullptr in asymmetric quantization for activations optimization");
 
                 auto& zp_users = new_a_zp->users;
                 auto& in_users = new_input->users;
@@ -682,7 +679,7 @@ void prepare_quantization::prepare_asymmetric_quantization(program_impl &p) {
 
             if (asymmetric_weights) {
                 if (!new_w_zp || !new_weights)
-                    CLDNN_ERROR_MESSAGE(convolution_node.id(), "Unexpected nullptr in asymmetric quantization for weights optimization");
+                    CLDNN_ERROR_MESSAGE(new_conv_node.id(), "Unexpected nullptr in asymmetric quantization for weights optimization");
 
                 auto& zp_users = new_w_zp->users;
                 auto& wei_users = new_weights->users;
index b3ef892..969885c 100644 (file)
 
 using namespace cldnn;
 
-remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations)
-    : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations) {}
+remove_redundant_reorders::remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing, bool update_implementations,
+    bool remove_output_reorders)
+    : base_pass("remove_redundant_reorders"), lo(lo_ref), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations),
+    remove_output_reorders(remove_output_reorders) {}
 
 void remove_redundant_reorders::run(program_impl& p) {
     auto update_implementation = [&](program_node& node) {
@@ -159,9 +161,13 @@ void remove_redundant_reorders::run(program_impl& p) {
 
         auto& r_node = node->as<reorder>();
 
+        bool no_output_optimization = remove_output_reorders ?
+            r_node.is_output() && (r_node.get_dependency(0).is_output() || r_node.get_dependency(0).is_type<input_layout>() ||
+                r_node.get_dependency(0).can_be_optimized()) : r_node.is_output();
+
         if (r_node.has_mean() ||
             !r_node.get_primitive()->subtract_per_feature.empty() ||
-            r_node.is_output() ||
+            no_output_optimization ||
             !r_node.get_fused_activations_funcs().empty())
             continue;
 
@@ -170,7 +176,7 @@ void remove_redundant_reorders::run(program_impl& p) {
 
         // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer,
         // but pads need to be handled correctly.
-        if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx &&
+        if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() &&
             i_layout.size.spatial[0] == 1 && i_layout.size.spatial[1] == 1 &&
             o_layout.data_padding.upper_size() == (tensor)0 && o_layout.data_padding.lower_size() == (tensor)0) {
             r_node.can_be_optimized(true);
index 55eca03..1ad6612 100644 (file)
@@ -22,6 +22,7 @@
 #include "layout_optimizer.h"
 #include "program_impl.h"
 #include "program_helpers.h"
+#include "mvn_inst.h"
 #include <vector>
 #include <memory>
 #include <list>
@@ -344,7 +345,7 @@ void insert_reorders(program_impl& p, const std::map<program_node*, format::type
             continue;
 
         auto fmt = fmt_map.at(node);
-        if (fmt == format::any)
+        if (fmt == format::any || format::is_image(fmt))
             continue;
 
         insert_reorders_in_dir<direction_e::forwards>(p, fmt_map, rf, node);
@@ -358,7 +359,7 @@ void insert_reorders(program_impl& p, const std::map<program_node*, format::type
             continue;
 
         auto fmt = fmt_map.at(node);
-        if (fmt == format::any)
+        if (fmt == format::any || format::is_image(fmt))
             continue;
 
         insert_reorders_in_dir<direction_e::backwards>(p, fmt_map, rf, node);
@@ -371,6 +372,70 @@ void reorder_inputs::run(program_impl& p, layout_optimizer& lo, reorder_factory&
     auto fmt_map = get_preferred_formats(p, lo);
     propagate_formats(p, fmt_map, lo);
     minimize_local_reorders(p, fmt_map, lo);
+
+    // WA START ============================================================================================================
+    if (lo.get_optimization_attributes().b_fs_yx_fsv16_network) {
+        // This is a temporary work-around for a known bad case until byxf_af32 handling is corrected in layout_optimizer.
+        //
+        // Find pattern:
+        //    mvn(int8, b_fs_yx_fsv16, [x,16,1280,720]) -> conv(int8, byxf_af32, [x,3,1280,720]) -> mvn(*, bfyx) ->
+        // Replace with:
+        //    mvn(b_fs_yx_fsv16) -> conv(b_fs_yx_fsv16) -> mvn(b_fs_yx_fsv16) ->
+        //
+        // Generally, for such a convolution b_fs_yx_fsv16 will always perform better than byxf_af32,
+        // but this WA is needed to avoid unvalidated int8 b_fs_yx_fsv16 networks and potential regressions.
+        // Additionally, the reorder from af32 -> bfyx takes ~9 times longer than the actual convolution.
+        for (auto& node_ptr : p.get_processing_order()) {
+            if (!node_ptr->is_in_data_flow() || !node_ptr->is_type<convolution>() || fmt_map.at(node_ptr) != format::byxf_af32)
+                continue;
+
+            auto& conv_node = node_ptr->as<convolution>();
+
+            bool input_path =
+                conv_node.input().get_output_layout().data_type == data_types::i8 &&
+                conv_node.input().is_type<mvn>() &&
+                fmt_map.at(&conv_node.input()) == format::b_fs_yx_fsv16;
+            bool output_path =
+                conv_node.get_users().size() == 1 &&
+                conv_node.get_users().front()->is_type<mvn>() &&
+                fmt_map.at(conv_node.get_users().front()) == format::bfyx &&
+                conv_node.get_users().front()->get_users().size() == 1 &&
+                !conv_node.get_users().front()->as<mvn>().get_primitive()->across_channels;
+
+            if (!input_path || !output_path)
+                continue;
+
+            auto in_lay = conv_node.input().get_output_layout();
+            auto out_lay = conv_node.get_output_layout();
+            auto wei_lay = conv_node.weights().get_output_layout();
+            bool correct_layouts =
+                // weights
+                wei_lay.data_type == data_types::i8 &&
+                wei_lay.size.spatial[0] == 3 && wei_lay.size.spatial[1] == 3 &&
+                // input/output
+                in_lay.data_type == data_types::i8 && out_lay.data_type == data_types::i8 &&
+                in_lay.size.feature[0] == 16 && out_lay.size.feature[0] == 3 &&
+                in_lay.size.spatial[0] == 1280 && out_lay.size.spatial[0] == 1280 &&
+                in_lay.size.spatial[1] == 720 && out_lay.size.spatial[1] == 720;
+
+            if (!correct_layouts)
+                continue;
+
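+            // only apply the replacement to a plain, non-grouped, non-transposed convolution without zero points and with dilation 1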
+            bool correct_conv =
+                conv_node.get_groups() == 1 && conv_node.get_split() == 1 && conv_node.get_deformable_groups() == 1 &&
+                !conv_node.get_depthwise_sep_opt() && !conv_node.get_transposed() &&
+                !conv_node.activations_zero_points_term() && !conv_node.weights_zero_points_term() && !conv_node.compensation_term() &&
+                conv_node.get_primitive()->dilation == tensor(1);
+
+            if (!correct_conv)
+                continue;
+
+            fmt_map.at(node_ptr) = format::b_fs_yx_fsv16;
+            fmt_map.at(conv_node.get_users().front()) = format::b_fs_yx_fsv16;
+        }
+    }
+    // WA END ==============================================================================================================
+
     insert_reorders(p, fmt_map, rf);
 
     for (auto n : p.get_processing_order()) {
index 835222c..67788d9 100644 (file)
@@ -18,7 +18,9 @@
 #pragma once
 #include "api/depth_to_space.hpp"
 #include "primitive_inst.h"
+#include "kernel_selector/core/actual_kernels/depth_to_space/depth_to_space_kernel_base.h"
 #include <string>
+#include <memory>
 
 namespace cldnn {
 template <>
@@ -29,6 +31,9 @@ public:
     using parent::parent;
 
     program_node& input(size_t index = 0) const { return get_dependency(index); }
+    std::shared_ptr<kernel_selector::fuse_params> get_fuse_params() const override {
+        return std::make_shared<kernel_selector::depth_to_space_fuse_params>();
+    }
 };
 
 using depth_to_space_node = typed_program_node<depth_to_space>;
index 7c5ddd7..34f8c96 100644 (file)
@@ -176,6 +176,7 @@ public:
 private:
     void run(program_impl& p) override;
     void fuse_conv_eltwise(program_impl& p, program_node* node);
+    void fuse_conv_depth_to_space(program_impl& p, program_node* node);
     layout_optimizer& _lo;
     bool b_fs_yx_fsv16_opt;
 };
@@ -289,13 +290,15 @@ private:
 
 class remove_redundant_reorders : public base_pass {
 public:
-    explicit remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing = false, bool update_implementations = false);
+    explicit remove_redundant_reorders(layout_optimizer& lo_ref, bool enable_reorder_fusing = false, bool update_implementations = false,
+        bool remove_output_reorders = false);
     void run(program_impl& p) override;
 
 private:
     layout_optimizer& lo;
     bool enable_reorder_fusing;
     bool update_implementations;
+    bool remove_output_reorders;
 };
 
 class reorder_inputs : public base_pass {
index 9277513..7ec2622 100644 (file)
@@ -112,7 +112,27 @@ struct program_helpers {
                               const layout& target_layout,
                               size_t begin_offset,
                               size_t end_offset);
-    static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
+
     static std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2);
+
+    // helper functions for deconvolution optimizations
+    static void reshape_deconvolution_weights(const std::vector<float> &deconv_weights,
+                                              const int channels,
+                                              const int kernel_width,
+                                              const int kernel_height,
+                                              const int scale_factor,
+                                              std::vector<std::vector<std::vector<float> > >& subpixel_weights);
+    template <typename T>
+    static void set_weights_values(T* mem, std::vector<std::vector<std::vector<float> > > args) {
+        for (uint32_t x = 0; x < static_cast<uint32_t>(args.size()); ++x) {
+            for (uint32_t y = 0; y < static_cast<uint32_t>(args[x].size()); ++y) {
+                for (uint32_t z = 0; z < static_cast<uint32_t>(args[x][y].size()); ++z) {
+                    *mem = static_cast<T>(args[x][y][z]);
+                    mem++;
+                }
+            }
+        }
+    }
+    static layout get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split);
 };
 }  // namespace cldnn
index 5849c66..4fe44eb 100644 (file)
@@ -111,6 +111,8 @@ inline std::string fmt_to_str(format fmt) {
             return "b_fs_zyx_fsv16";
         case format::bs_fs_zyx_bsv16_fsv16:
             return "bs_fs_zyx_bsv16_fsv16";
+        case format::image_2d_rgba:
+            return "image_2d_rgba";
 
         case format::oiyx:
             return "oiyx";
@@ -134,6 +136,8 @@ inline std::string fmt_to_str(format fmt) {
             return "image_2d_weights_winograd_6x3_s1_xfbyb";
         case format::os_iyx_osv16:
             return "os_iyx_osv16";
+        case format::os_is_yx_osv16_isv16:
+            return "os_is_yx_osv16_isv16";
         case format::os_iyx_osv32:
             return "os_iyx_osv32";
         case format::os_iyx_osv64:
@@ -197,6 +201,8 @@ inline std::string fmt_to_str(format fmt) {
             return "g_os_is_yx_isv8_osv16_isv2";
         case format::g_os_is_zyx_isv16_osv16:
             return "g_os_is_zyx_isv16_osv16";
+        case format::g_os_is_yx_osv16_isv4:
+            return "g_os_is_yx_osv16_isv4";
         default:
             return "unknown (" + std::to_string(fmt.value) + ")";
     }
index fed6a81..0d9d3b9 100644 (file)
@@ -155,6 +155,8 @@ kernel_selector::data_layout to_data_layout(format f) {
             return kernel_selector::data_layout::bs_fs_yx_bsv16_fsv16;
         case format::nv12:
             return kernel_selector::data_layout::nv12;
+        case format::image_2d_rgba:
+            return kernel_selector::data_layout::image_2d_rgba;
         default:
             throw std::invalid_argument("Format f (" +  std::to_string((int32_t)f.value) + ") is not a proper data layout");
     }
@@ -206,6 +208,8 @@ cldnn::format from_data_layout(kernel_selector::data_layout l) {
             return cldnn::format::b_fs_yx_fsv4;
         case kernel_selector::data_layout::nv12:
             return cldnn::format::nv12;
+        case kernel_selector::data_layout::image_2d_rgba:
+            return cldnn::format::image_2d_rgba;
         default:
             throw std::invalid_argument("Unable to convert data layout " + std::to_string(l) + " to tensor format");
     }
@@ -225,6 +229,8 @@ kernel_selector::weights_layout to_weights_layout(format f) {
             return kernel_selector::weights_layout::yxio;
         case format::os_iyx_osv16:
             return kernel_selector::weights_layout::os_iyx_osv16;
+        case format::os_is_yx_osv16_isv16:
+            return kernel_selector::weights_layout::os_is_yx_osv16_isv16;
         case format::os_iyx_osv32:
             return kernel_selector::weights_layout::os_iyx_osv32;
         case format::os_iyx_osv64:
@@ -314,6 +320,8 @@ kernel_selector::weights_layout to_weights_layout(format f) {
             return kernel_selector::weights_layout::g_os_is_yx_isv8_osv16_isv2;
         case format::g_os_is_zyx_isv16_osv16:
             return kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16;
+        case format::g_os_is_yx_osv16_isv4:
+            return kernel_selector::weights_layout::g_os_is_yx_osv16_isv4;
         default:
             throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout");
     }
@@ -335,6 +343,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
             return cldnn::format::yxfb;
         case kernel_selector::weights_layout::os_iyx_osv16:
             return cldnn::format::os_iyx_osv16;
+        case kernel_selector::weights_layout::os_is_yx_osv16_isv16:
+            return cldnn::format::os_is_yx_osv16_isv16;
         case kernel_selector::weights_layout::os_iyx_osv32:
             return cldnn::format::os_iyx_osv32;
         case kernel_selector::weights_layout::os_iyx_osv64:
@@ -417,6 +427,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
             return cldnn::format::g_os_is_yx_isv8_osv16_isv2;
         case kernel_selector::weights_layout::g_os_is_zyx_isv16_osv16:
             return cldnn::format::g_os_is_zyx_isv16_osv16;
+        case kernel_selector::weights_layout::os_is_yx_osv16_isv4:
+            return cldnn::format::g_os_is_yx_osv16_isv4;
         default:
             return cldnn::format::bfyx;
     }
@@ -455,7 +467,6 @@ kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split
     kernel_selector::n_dims vec(kernel_selector::DataTensor::ChannelsCount(ks_layout));
 
     size_t pitch = 1;
-
     auto new_vals = vals;
 
     if (ks_layout == kernel_selector::Tensor::byxf_af32) {
@@ -633,6 +644,7 @@ void set_params(const program_node& node, kernel_selector::params& params) {
 
     params.engineInfo.bSubGroupSupport = context->extension_supported("cl_intel_subgroups");
     params.engineInfo.bSubGroupShortSupport = context->extension_supported("cl_intel_subgroups_short");
+    params.engineInfo.bSubGroupCharSupport = context->extension_supported("cl_intel_subgroups_char");
     params.engineInfo.bFP16Support = context->extension_supported("cl_khr_fp16");
     params.engineInfo.bFP64Support = context->extension_supported("cl_khr_fp64");
     params.engineInfo.bIMADSupport = device_info.supports_imad != 0;
index 21c11a7..bbf7825 100644 (file)
@@ -28,6 +28,8 @@
 #include "eltwise_inst.h"
 #include "pooling_inst.h"
 #include "permute_inst.h"
+#include "quantize_inst.h"
+#include "mvn_inst.h"
 #include <vector>
 #include <memory>
 #include <utility>
@@ -121,6 +123,11 @@ bool layout_optimizer::is_format_supported(program_node& node, format::type fmt)
     if (node.is_type<fully_connected>() && fmt == format::byxf)
         return false;
 
+    if (node.is_type<mvn>() && fmt == format::b_fs_yx_fsv16 &&
+        node.get_dependency(0).get_output_layout().data_type != data_types::i8 &&
+        node.get_dependency(0).get_output_layout().data_type != data_types::u8)
+        return false;
+
     if (node.is_type<input_layout>())
         return node.get_output_layout().format == fmt;
 
@@ -190,7 +197,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next,
         fmt_prev == format::bfyx &&
         ((fmt_next == format::fs_b_yx_fsv32 && next.as<convolution>().get_primitive()->groups == 1) ||
         (fmt_next == format::b_fs_yx_fsv32 && prev_output_layout.size.feature[0] == 3) ||
-        (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3) ||
+        (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 &&
+        prev_output_layout.size.feature[0] == 3 &&
+        (next_output_layout.data_type != data_types::i8 && next_output_layout.data_type != data_types::u8)) ||
+         (fmt_next == format::b_fs_yx_fsv16 && next_output_layout.size.feature[0] >= 16 && prev_output_layout.size.feature[0] == 3) ||
         (fmt_next == format::bs_fs_yx_bsv16_fsv16 && next_output_layout.size.feature[0] % 16 == 0 && prev_output_layout.size.feature[0] == 3)))
         return true;
 
@@ -219,7 +229,7 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node
         return true;
 
     if (prev.is_type<quantize>() &&
-        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32))
+        (fmt_next == format::b_fs_yx_fsv4 || fmt_next == format::b_fs_yx_fsv32 || fmt_next == format::b_fs_zyx_fsv32 || fmt_next == format::b_fs_yx_fsv16))
         return true;
 
     return false;
@@ -332,7 +342,24 @@ bool layout_optimizer::convolution_b_fs_yx_fsv16_opt(layout const &input_layout,
                                                      const layout &weights_layout,
                                                      std::shared_ptr<const convolution> conv,
                                                      bool weak_restrictions) {
-    // A set of rules that define when b_fs_yx_fsv16 mem format can be used
+    // A set of rules that define when b_fs_yx_fsv16 mem format can be used for int8 case
+    bool i8_dt_case = (input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8) &&
+        weights_layout.data_type == data_types::i8 &&
+        (conv->activations_zero_points.empty() && conv->weights_zero_points.empty());  // only symmetric
+    if (i8_dt_case) {
+        auto ks_x = weights_layout.size.spatial[0];
+        auto ks_y = weights_layout.size.spatial[1];
+        if (input_layout.size.spatial[2] == 1 &&
+            input_layout.size.batch[0] < 16 &&
+            ((ks_x == 7 && ks_y == 7) || (ks_x == 3 && ks_y == 3) || (ks_x == 1 && ks_y == 1) || (ks_x == 5 && ks_y == 5)) &&
+            weights_layout.size.batch[0] >= 16 &&
+            ((conv->groups == 1 && conv->split() == 1) ||
+             conv->groups == static_cast<uint32_t>(input_layout.size.feature[0]) ||
+             conv->split() == static_cast<int32_t>(input_layout.size.feature[0])) &&
+            conv->dilation == tensor{ 1 })
+            return true;
+    }
+    // A set of rules that define when b_fs_yx_fsv16 mem format can be used for fp16/fp32 case
     auto feature_block_size = 16;
     auto correct_data_type = input_layout.data_type == data_types::f16 || input_layout.data_type == data_types::f32;
     correct_data_type &= weights_layout.data_type == input_layout.data_type;
@@ -530,7 +557,7 @@ format layout_optimizer::imad_case(convolution_node const& node) const {
     if (dims_count == 5 && is_grouped) {
         return format::bfzyx;
     } else if (dims_count == 4 && is_grouped && !is_dw) {
-        return format::bfyx;
+        return format::b_fs_yx_fsv4;
     }
 
     bool asymmetric_quantization = node.activations_zero_points_term() || node.weights_zero_points_term();
@@ -567,7 +594,12 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout,
     const float cond_denom = _total_conv > 0 ? 1.0f / static_cast<float>(_total_conv) : 1.0f;
 
     if ((input_layout.data_type == data_types::u8 || input_layout.data_type == data_types::i8)) {
-        expected_format = imad_case(node);
+        if ((_optimization_attributes.b_fs_yx_fsv16_network &&
+            convolution_b_fs_yx_fsv16_opt(input_layout, output_or_weights_layout, prim))) {
+            expected_format = cldnn::format::b_fs_yx_fsv16;
+        } else {
+            expected_format = imad_case(node);
+        }
         expected_tensor = current_layout.size;
     } else if (_optimization_attributes.b_fs_zyx_fsv16_network &&
             convolution_b_fs_zyx_fsv16_opt(input_layout,
index 1596e09..4d2c049 100644 (file)
@@ -38,6 +38,7 @@
 #include "reshape_inst.h"
 #include "activation_inst.h"
 #include "scale_inst.h"
+#include "depth_to_space_inst.h"
 #include "convolution_inst.h"
 #include "concatenation_inst.h"
 #include "crop_inst.h"
@@ -55,6 +56,7 @@
 #include "proposal_inst.h"
 #include "reorder_inst.h"
 #include "split_inst.h"
+#include "mvn_inst.h"
 #include "to_string_utils.h"
 #include "gpu/memory_gpu.h"
 
@@ -466,6 +468,9 @@ void program_impl::post_optimize_graph(bool is_internal) {
         // ToDo remove hidden dependencies from propagate_constants pass
         apply_opt_pass<propagate_constants>();
     }
+
+    if (options.get<build_option_type::optimize_data>()->enabled())
+        apply_opt_pass<remove_redundant_reorders>(lo, false, true, true);  // pass to remove output reorders while all others graph optimizations were done
 }
 
 // mark if the node is constant assuming that all dependencies are marked properly
@@ -1116,7 +1121,12 @@ void program_impl::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::prior_box::type_id() &&
             prim.type() != cldnn::resample::type_id() &&
             prim.type() != cldnn::crop::type_id() &&
-            prim.type() != cldnn::scale::type_id())
+            prim.type() != cldnn::scale::type_id() &&
+            prim.type() != cldnn::depth_to_space::type_id() &&
+            (prim.type() != cldnn::mvn::type_id()
+             || (prim.as<mvn>().input().get_output_layout().data_type != data_types::u8 &&
+                 prim.as<mvn>().input().get_output_layout().data_type != data_types::i8)
+             || prim.as<mvn>().get_primitive()->across_channels))
             can_use_fsv16 = false;
 
         // WA to keep bfyx_f16 layout disabled for some topologies where it leads to regressions.
index 1e54d7f..2ff72f9 100644 (file)
@@ -21,6 +21,7 @@
 #include "data_inst.h"
 #include <algorithm>
 #include <utility>
+#include <vector>
 
 namespace cldnn {
 // helper function for merging the weights/biases buffers on cpu side for depthwise separable convolution optimization
@@ -44,6 +45,67 @@ void program_helpers::merge_buffers(engine_impl& engine,
     data_node.attach_memory(*data_to_allocate, false);
 }
 
+void program_helpers::reshape_deconvolution_weights(const std::vector<float> &deconv_weights,
+    const int channels,
+    const int kernel_width,
+    const int kernel_height,
+    const int scale_factor,
+    std::vector<std::vector<std::vector<float> > >& subpixel_weights) {
+
+    std::vector<std::vector<float> > weights(channels);
+
+    int pad_zero_x = kernel_width % 2 == 0 ? 0 : 1;
+    int pad_zero_y = kernel_height % 2 == 0 ? 0 : 1;
+
+    // reshape 9x9 deconv weights, for example 32 9x9 deconv weights to 32 10x10 conv weights
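+    // (odd kernel dimensions are zero-padded on the right/bottom so that the kernel splits evenly into sub-kernels)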
+    for (int f = 0; f < channels; ++f) {
+        for (int kernel_y = 0; kernel_y < kernel_height; ++kernel_y) {
+            for (int kernel_x = 0; kernel_x < kernel_width; ++kernel_x) {
+                int index = f * kernel_width * kernel_height + kernel_y * kernel_width + kernel_x;
+                weights[f].push_back(deconv_weights[index]);
+            }
+            if (pad_zero_x == 1) {    // pad with zero on x axis
+                weights[f].push_back(0.f);
+            }
+        }
+        if (pad_zero_y == 1) {    // pad a line on y axis with zero
+            for (int kernel_x = 0; kernel_x < kernel_width + pad_zero_x; ++kernel_x) {
+                weights[f].push_back(0.f);
+            }
+        }
+    }
+
+    // split the padded kernels into scale_factor^2 sets of sub-kernels, e.g. 32 10x10 kernels -> 4 sets of 32 5x5 kernels
+    for (int s = 0; s < scale_factor*scale_factor; ++s) {
+        subpixel_weights[s].resize(channels);
+    }
+
+    const int kernel_sz = kernel_width + pad_zero_x;
+
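+    // map a flattened weight index to one of the four sub-kernels based on the parity of its row and column (scale_factor == 2 case)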
+    auto get_row_index = [](int index, const int kernel_sz)->int {
+        bool isRowEven = (index / (kernel_sz)) % 2 == 0 ? true : false;
+        bool isColEven = (index % 2) == 0 ? true : false;
+        int kernel_num = isRowEven ? (isColEven ? 0 : 1) : isColEven ? 2 : 3;
+        return kernel_num;
+    };
+
+    int feature_num = static_cast<int>(weights.size());
+    for (int f = 0; f < feature_num; ++f) {
+        for (int i = 0; i < static_cast<int>(weights[f].size()); ++i) {
+            int row = get_row_index(i, kernel_sz);
+            subpixel_weights[row][f].push_back(weights[f][i]);
+        }
+    }
+
+    // reverse the flattened weights of each sub-kernel (a 180-degree flip of the kernel)
+    int subpixel_conv_num = static_cast<int>(subpixel_weights.size());
+    for (int s = 0; s < subpixel_conv_num; ++s) {
+        for (int row = 0; row < static_cast<int>(subpixel_weights[s].size()); ++row) {
+            std::reverse(std::begin(subpixel_weights[s][row]), std::end(subpixel_weights[s][row]));
+        }
+    }
+}
+
 // helper function for getting target layout used in depthwise sep optimization
 layout program_helpers::get_weights_layout(typed_program_node<cldnn::data>& data_node, int32_t split) {
     auto mem_layout = data_node.get_output_layout();
index 846e83e..0f74654 100644 (file)
@@ -48,12 +48,14 @@ layout reorder_inst::calc_output_layout(reorder_node const& node) {
         if (ofmt != ifmt)
             return layout(odt, ofmt, data_size, op);
 
-        CLDNN_ERROR_MESSAGE(node.id(), "Reordering between winograd weights and data formats is unsupported");
+        CLDNN_ERROR_MESSAGE(node.id(), "No image_nv12 to image_nv12 reorder is supported");
     } else if (ofmt.is_winograd() && ifmt.is_winograd()) {
         if (ofmt == ifmt)
             return layout(odt, ofmt, input_layout.size, op);
 
         CLDNN_ERROR_MESSAGE(node.id(), "Reordering between winograd weights and data formats is unsupported");
+    } else if (ifmt == format::image_2d_rgba) {
+        return layout(data_types::f16, format::bfyx, input_layout.size, op);
     }
 
     // transformation of data from standard to winograd
index 8cd209e..fa0fc02 100644 (file)
@@ -77,8 +77,11 @@ layout strided_slice_inst::calc_output_layout(strided_slice_node const& node) {
     std::vector<int32_t> output_shape;
     if (std::find(desc->new_axis_mask.begin(), desc->new_axis_mask.end(), 1) == desc->new_axis_mask.end()) {
         for (size_t i = 0; i < dims_num; ++i) {
-            int32_t outputDimSize = (end[i] - begin[i]) / strides[i];
-            if ((end[i] - begin[i]) % strides[i] != 0)
+            int32_t b = begin[i] < 0 ? input_layout.size.sizes(input_format)[i] - 1 : begin[i];
+            int32_t e = end[i] < 0 ? input_layout.size.sizes(input_format)[i] - 1 : end[i];
+            int32_t s = strides[i];
+            int32_t outputDimSize = std::abs((e - b) / s);
+            if ((e - b) % s != 0)
                 outputDimSize++;
             output_shape.push_back(outputDimSize);
         }
index ca6eee0..9678e61 100644 (file)
@@ -108,6 +108,80 @@ TEST(concat_gpu, mixed_input_types) {
     }
 }
 
+TEST(concat_gpu, mixed_input_types_5d) {
+    const auto& engine = get_test_engine();
+
+    auto input0 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+    auto input1 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+    auto input3 = memory::allocate(engine, { data_types::f16, format::bfzyx, { 1, 1, 1, 4, 3 } });
+
+    set_values(input0, { half_t(1.0f), half_t(2.0f), half_t(3.0f),
+                         half_t(4.0f), half_t(2.0f), half_t(2.0f),
+                         half_t(3.0f), half_t(4.0f), half_t(3.0f),
+                         half_t(3.0f), half_t(3.0f), half_t(5.0f) });
+    set_values(input1, { half_t(11), half_t(12), half_t(13),
+                         half_t(14), half_t(12), half_t(12),
+                         half_t(13), half_t(14), half_t(13),
+                         half_t(13), half_t(13), half_t(15) });
+    set_values(input2, { half_t(21), half_t(22), half_t(23),
+                         half_t(24), half_t(22), half_t(22),
+                         half_t(23), half_t(24), half_t(23),
+                         half_t(23), half_t(23), half_t(25) });
+    set_values(input3, { half_t(31.f), half_t(32.f), half_t(33.f),
+                         half_t(34.f), half_t(32.f), half_t(32.f),
+                         half_t(33.f), half_t(34.f), half_t(33.f),
+                         half_t(33.f), half_t(33.f), half_t(35.f) });
+
+    VF<float> output_vec = {
+            1.0f, 2.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f, 4.0f, 3.0f, 3.0f, 3.0f, 5.0f,
+            11.0f, 12.0f, 13.0f, 14.0f, 12.0f, 12.0f, 13.0f, 14.0f, 13.0f, 13.0f, 13.0f, 15.0f,
+            21.0f, 22.0f, 23.0f, 24.0f, 22.0f, 22.0f, 23.0f, 24.0f, 23.0f, 23.0f, 23.0f, 25.0f,
+            31.0f, 32.0f, 33.0f, 34.0f, 32.0f, 32.0f, 33.0f, 34.0f, 33.0f, 33.0f, 33.0f, 35.0f };
+
+    topology topology(
+            input_layout("input0", input0.get_layout()),
+            input_layout("input1", input1.get_layout()),
+            input_layout("input2", input2.get_layout()),
+            input_layout("input3", input3.get_layout()),
+            concatenation("concat",
+                          { "input0", "input1", "input2", "input3" },
+                          concatenation::concatenation_axis::along_f,
+                          data_types::f32,
+                          padding{ { 0,0,0,0 }, 0 })
+    );
+
+    network network(engine, topology);
+    network.set_input_data("input0", input0);
+    network.set_input_data("input1", input1);
+    network.set_input_data("input2", input2);
+    network.set_input_data("input3", input3);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "concat");
+
+    auto output_memory = outputs.at("concat").get_memory();
+    auto output_layout = output_memory.get_layout();
+    auto output_ptr = output_memory.pointer<float>();
+
+    int z_size = output_layout.size.spatial[2];
+    int y_size = output_layout.size.spatial[1];
+    int x_size = output_layout.size.spatial[0];
+    int f_size = output_layout.size.feature[0];
+    int b_size = output_layout.size.batch[0];
+    EXPECT_EQ(output_layout.format, format::bfzyx);
+    EXPECT_EQ(z_size, 3);
+    EXPECT_EQ(y_size, 4);
+    EXPECT_EQ(x_size, 1);
+    EXPECT_EQ(f_size, 4);
+    EXPECT_EQ(b_size, 1);
+
+    for (size_t x = 0; x < output_layout.count(); ++x) {
+        EXPECT_EQ(output_vec[x], output_ptr[x]);
+    }
+}
+
 using TestParamType_concat = ::testing::tuple<size_t,   // 0 - Input Batch size
         std::vector<size_t>,                            // 1 - Inputs Features Sizes
         size_t,                                         // 2 - Input Y Size
index 13a1e67..ecd134e 100644 (file)
@@ -80,7 +80,7 @@ struct convolution_accumulator<uint8_t> {
 template<typename InputT, typename OutputT = InputT, typename WeightsT = InputT,  typename AccT = typename convolution_accumulator<InputT>::type>
 VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int stride_y, int stride_x, float bias, int dilation_y = 1, int dilation_x = 1,
         int input_padding_y = 0, int input_padding_x = 0, int output_padding_y = 0,
-        int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false,
+        int output_padding_x = 0, size_t f_begin = 0, size_t f_end = 0, bool depthwise = false, bool grouped = false,
         const VF<InputT>& data_zp = {}, const WeightsT& weights_zp = 0)
 {
     size_t kernel_extent_y = dilation_y * (filter[0].size() - 1) + 1;
@@ -100,27 +100,25 @@ VVF<OutputT> reference_convolve(VVVF<InputT> &input, VVVF<WeightsT> &filter, int
                 for (size_t yf = 0; yf < filter[0].size(); ++yf) {
                     int yi = -input_padding_y + (int)yf * dilation_y + stride_y * (int)y;
                     bool yi_inside = yi >= 0 && (int)input[0].size() > yi;
-                    if (!yi_inside && !asymm_data) continue;
+                    if (!yi_inside) continue;
                     for (size_t xf = 0; xf < filter[0][0].size(); ++xf) {
                         int xi = -input_padding_x + (int)xf * dilation_x + stride_x * (int)x;
                         bool xi_inside = xi >= 0 && (int)input[0][0].size() > xi;
-                        if (!xi_inside && !asymm_data) continue;
+                        if (!xi_inside) continue;
 
-                        AccT input_val;
-                        if (xi_inside && yi_inside) {
-                            input_val = static_cast<AccT>(input[f][yi][xi]);
-                        } else {
-                            input_val = static_cast<AccT>(0);
-                        }
+                        auto input_val = static_cast<AccT>(input[f][yi][xi]);
 
                         if (asymm_data) {
                             input_val = input_val - static_cast<AccT>(data_zp[f]);
                         }
 
                         AccT weights_val;
-                        if (!depthwise) {
+                        if (!depthwise && !grouped) {
                             weights_val = static_cast<AccT>(filter[f][yf][xf]);
-                        } else {
+                        } else if (grouped) {
+                            weights_val = static_cast<AccT>(filter[f - filter_begin][yf][xf]);
+                        } else {
                             weights_val = static_cast<AccT>(filter[0][yf][xf]);
                         }
 
@@ -5116,6 +5114,16 @@ using TestParamType_convolution_depthwise_gpu = ::testing::tuple<int,   // 0 - I
         int,   // 5 - Output padding
         bool>; // 6 - With bias
 
+using TestParamType_grouped_convolution_gpu = ::testing::tuple<  int,    // 0 - Input X size
+        int,   // 1 - Input Y size
+        int,   // 2 - Input features
+        int,   // 3 - Output features
+        int,   // 4 - Kernel sizeX
+        int,   // 5 - Kernel sizeY
+        int,   // 6 - Groups number
+        int,   // 7 - Stride
+        int>;  // 8 - Batch
+
 struct convolution_gpu : public ::testing::TestWithParam<TestParamType_convolution_gpu>
 {
     static std::string
@@ -5184,6 +5192,22 @@ struct convolution_depthwise_gpu_fsv16 : public ::testing::TestWithParam<TestPar
     }
 };
 
+struct convolution_grouped_gpu : public ::testing::TestWithParam<TestParamType_grouped_convolution_gpu> {
+    static std::string PrintToStringParamName(
+        testing::TestParamInfo<TestParamType_grouped_convolution_gpu> param_info) {
+        // construct a readable name
+        return "in" + std::to_string(testing::get<0>(param_info.param)) + "x" +
+               std::to_string(testing::get<1>(param_info.param)) + "y" +
+               std::to_string(testing::get<2>(param_info.param)) + "f" +
+               "_output" + std::to_string(testing::get<3>(param_info.param)) + "f" +
+               "_filter" + std::to_string(testing::get<4>(param_info.param)) + "x" +
+                           std::to_string(testing::get<5>(param_info.param)) + "y" +
+               "_groups" + std::to_string(testing::get<6>(param_info.param)) +
+               "_stride" + std::to_string(testing::get<7>(param_info.param)) +
+               "_batch"  + std::to_string(testing::get<8>(param_info.param));
+    }
+};
+
 TEST_P(convolution_gpu, b_fs_yx_fsv4)
 {
     const int in_B = 2;
@@ -5890,6 +5914,108 @@ TEST_P(convolution_gpu_fs_byx_fsv32, fs_byx_fsv32)
                 }
 }
 
+TEST(convolution_f16_fsv_gpu, convolution_f16_fsv_gpu_padding) {
+    const auto& engine = get_test_engine();
+
+    if (!engine.get_info().supports_fp16)
+    {
+        std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
+        EXPECT_EQ(1, 1);
+        return;
+    }
+
+    const int batch_num = 2;
+    const int input_xy = 32;
+    const int input_f = 96;
+    const int output_f = 192;
+    const int filter_xy = 1;
+    const int stride = 1;
+    const int output_xy = 1 + (input_xy - filter_xy) / stride;
+
+    auto input_size = tensor(batch_num, input_f, input_xy, input_xy);
+    auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_xy, input_xy, -1, 1);
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+    auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
+    set_values(weights_mem, weights_data_bfyx);
+
+    // Will be used to store reference values calculated in branches depending on bias
+    // Will be used to store the reference convolution results
+
+    topology topology(
+        input_layout("input", input_mem.get_layout()),
+        data("weights_fsv", weights_mem));
+
+    // add input padding by X and Y
+    layout w_pad(data_types::f16, format::bfyx, input_size, padding({ 0,0,1,1 }, { 0, 0, 0, 0 }));
+    topology.add(reorder("input_fsv", "input", w_pad));
+
+    // Generate bias data
+    auto biases_size = tensor(1, output_f, 1, 1);
+    auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
+    auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
+    set_values(biases_mem, biases_data);
+
+    // Calculate reference values
+    for (auto bi = 0; bi < batch_num; ++bi)
+    {
+        for (auto ofi = 0; ofi < output_f; ++ofi)
+        {
+            reference_result[bi][ofi] = reference_convolve(
+                input_data[bi], weights_data[ofi],
+                stride, stride,
+                biases_data[ofi],
+                1, 1);
+        }
+    }
+
+    topology.add(data("biases_fsv", biases_mem));
+
+    auto conv_fsv = convolution("conv_fsv", "input_fsv", { "weights_fsv" }, { "biases_fsv" },
+        { 1, 1, stride, stride }, { 0, 0, 0, 0 });
+
+    topology.add(conv_fsv);
+
+    build_options options;
+    implementation_desc conv_impl = { format::fs_b_yx_fsv32, "convolution_gpu_bfyx_to_fs_byx_fsv32" };
+    options.set_option(build_option::force_implementations({ {"conv_fsv", conv_impl} }));
+    options.set_option(build_option::optimize_data(true));
+    network network(engine, topology, options);
+
+    network.set_input_data("input", input_mem);
+
+    network.execute();
+
+    auto out_mem = network.get_output("conv_fsv").get_memory();
+    auto out_ptr = out_mem.pointer<FLOAT16>();
+
+    ASSERT_EQ(out_mem.get_layout().format, format::fs_b_yx_fsv32);
+
+    for (int bi = 0; bi < batch_num; ++bi)
+        for (int fi = 0; fi < output_f; ++fi)
+            for (int yi = 0; yi < output_xy; ++yi)
+                for (int xi = 0; xi < output_xy; ++xi)
+                {
+                    auto val_ref = reference_result[bi][fi][yi][xi];
+                    auto val = out_ptr[(fi / 32) * batch_num * output_xy * output_xy * 32 +
+                        bi * output_xy * output_xy * 32 +
+                        yi * output_xy * 32 +
+                        xi * 32 +
+                        fi % 32];
+                    auto equal = are_equal(val_ref, val, 1e-2f);
+                    EXPECT_TRUE(equal);
+                    if (!equal)
+                    {
+                        std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
+                    }
+                }
+}
+
 using TestParamType_convolution_gpu_with_crop = ::testing::tuple<int,   // 0 - Filter size
     int,   // 1 - Input size
     int,   // 2 - Input/output features
@@ -6098,6 +6224,148 @@ TEST_P(convolution_gpu_fs_byx_fsv32_crop, fs_byx_fsv32_crop)
 }
 
 
+
+TEST(convolution_gpu, bfyx_iyxo_5x5_fp16)
+{
+
+    const auto& engine = get_test_engine();
+
+    if (!engine.get_info().supports_fp16)
+    {
+        std::cout << "[ SKIPPED ] The test is skipped (cl_khr_fp16 is not supported)." << std::endl;
+        EXPECT_EQ(1, 1);
+        return;
+    }
+
+    const int batch_num = 1;
+    const int output_f = 4;
+
+    const int input_f = 32;
+    const int filter_xy = 5;
+    const int stride = 1;
+    const int output_padding = 0;
+    const bool with_bias = false;
+    const int input_size_x = 64;
+    const int input_size_y = 20;
+
+    const int input_offset = -(filter_xy / 2);
+
+    const int output_x = 1 + (input_size_x + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
+
+    const int output_y = 1 + (input_size_y + 2 * (-input_offset) - filter_xy) / stride + 2 * output_padding;
+
+    auto input_size = tensor(batch_num, input_f, input_size_x, input_size_y);
+    auto input_data = generate_random_4d<FLOAT16>(batch_num, input_f, input_size_y, input_size_x, -1, 1);
+
+    auto input_data_bfyx = flatten_4d(format::bfyx, input_data);
+    auto input_mem = memory::allocate(engine, { data_types::f16, format::bfyx, input_size });
+    set_values(input_mem, input_data_bfyx);
+
+    auto weights_size = tensor(output_f, input_f, filter_xy, filter_xy);
+    auto weights_data = generate_random_4d<FLOAT16>(output_f, input_f, filter_xy, filter_xy, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    auto weights_mem = memory::allocate(engine, { data_types::f16, format::bfyx, weights_size });
+
+    set_values(weights_mem, weights_data_bfyx);
+
+    // Will be used to store reference values calculated in branches depending on bias
+    auto reference_result = VVVVF<FLOAT16>(batch_num, VVVF<FLOAT16>(output_f));
+
+    topology topology(
+        input_layout("input", input_mem.get_layout()),
+        data("weights_fsv", weights_mem)
+    );
+
+    if (with_bias)
+    {
+        // Generate bias data
+        auto biases_size = tensor(1, output_f, 1, 1);
+        auto biases_data = generate_random_1d<FLOAT16>(output_f, -1, 1);
+        auto biases_mem = memory::allocate(engine, { data_types::f16, format::bfyx, biases_size });
+        set_values(biases_mem, biases_data);
+
+        // Calculate reference values with bias
+        for (auto bi = 0; bi < batch_num; ++bi)
+        {
+            for (auto ofi = 0; ofi < output_f; ++ofi)
+            {
+                reference_result[bi][ofi] = reference_convolve(
+                    input_data[bi], weights_data[ofi],
+                    stride, stride, biases_data[ofi],
+                    1, 1,                               // dilation
+                    -input_offset, -input_offset,       // input padding
+                    output_padding, output_padding);
+            }
+        }
+
+        topology.add(data("biases_fsv", biases_mem));
+
+        auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" }, { "biases_fsv" },
+                                    { 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
+        conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
+
+        topology.add(conv_fsv);
+    }
+    else
+    {
+        // Calculate reference values without bias
+        for (auto bi = 0; bi < batch_num; ++bi)
+        {
+            for (auto ofi = 0; ofi < output_f; ++ofi)
+            {
+                reference_result[bi][ofi] = reference_convolve(
+                    input_data[bi], weights_data[ofi],
+                    stride, stride,
+                    0,                                  // bias
+                    1, 1,                               // dilation
+                    -input_offset, -input_offset,       // input padding
+                    output_padding, output_padding);
+            }
+        }
+
+        auto conv_fsv = convolution("conv_fsv", "input", { "weights_fsv" },
+            { 1, 1, stride, stride }, { 0, 0, input_offset, input_offset });
+        conv_fsv.output_padding = padding({ 0, 0, output_padding, output_padding }, 0.f);
+
+        topology.add(conv_fsv);
+    }
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    network network(engine, topology, options);
+
+    network.set_input_data("input", input_mem);
+
+    network.execute();
+
+    auto out_mem = network.get_output("conv_fsv").get_memory();
+    auto out_ptr = out_mem.pointer<FLOAT16>();
+
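+    // Output is read back assuming plain bfyx indexing: ((b * F + f) * Y + y) * X + x.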
+    for (int bi = 0; bi < batch_num; ++bi)
+        for (int fi = 0; fi < output_f; ++fi)
+            for (int yi = 0; yi < output_y; ++yi)
+                for (int xi = 0; xi < output_x; ++xi)
+                {
+                    auto val_ref = reference_result[bi][fi][yi][xi];
+                    auto val = out_ptr[bi * output_f * output_x * output_y +
+                                        fi * output_y * output_x  +
+                                        yi * output_x +
+                                        xi];
+                    auto equal = are_equal(val_ref, val, 1e-2f);
+                    EXPECT_TRUE(equal);
+                    if (!equal)
+                    {
+                        std::cout << "At b = " << bi << ", fi = " << fi << ", xi = " << xi << ", yi = " << yi << std::endl;
+                    }
+                }
+}
+
 INSTANTIATE_TEST_CASE_P(convolution_gpu_block,
                         convolution_gpu_block_layout,
                         ::testing::Values(
@@ -7364,8 +7632,131 @@ INSTANTIATE_TEST_CASE_P(convolution_depthwise_gpu_bfyx,
                         ),
                         convolution_depthwise_gpu::PrintToStringParamName);
 
+INSTANTIATE_TEST_CASE_P(convolution_grouped_b_fs_yx_fsv4,
+                        convolution_grouped_gpu,
+                        ::testing::Values(
+                            // Input X size, Input Y size, Input features, Output features, Kernel size X,
+                            // Kernel size Y, Groups number, Stride, Batch
+                            TestParamType_grouped_convolution_gpu(4, 4, 16, 16, 3, 3, 4, 1, 1),
+                            TestParamType_grouped_convolution_gpu(4, 4, 8, 4, 2, 2, 2, 1, 4),
+                            TestParamType_grouped_convolution_gpu(8, 8, 16, 16, 4, 4, 4, 1, 1),
+                            TestParamType_grouped_convolution_gpu(17, 17, 32, 96, 3, 3, 2, 2, 2),
+                            TestParamType_grouped_convolution_gpu(16, 16, 8, 48, 2, 2, 2, 2, 1),
+                            TestParamType_grouped_convolution_gpu(3, 3, 48, 96, 2, 2, 2, 8, 1),
+                            TestParamType_grouped_convolution_gpu(6, 6, 8, 26, 3, 3, 2, 4, 1)),
+                        convolution_grouped_gpu::PrintToStringParamName);
+
+TEST_P(convolution_grouped_gpu, grouped_b_fs_yx_fsv4) {
+    const auto& engine = get_test_engine();
+
+    const int input_x = testing::get<0>(GetParam()),
+              input_y = testing::get<1>(GetParam()),
+              input_f = testing::get<2>(GetParam()),
+              output_f = testing::get<3>(GetParam()),
+              filter_x = testing::get<4>(GetParam()),
+              filter_y = testing::get<5>(GetParam()),
+              groups = testing::get<6>(GetParam()),
+              stride = testing::get<7>(GetParam()),
+              batch_num = testing::get<8>(GetParam()),
+              output_padding = 0,
+              input_offset_y = (filter_y - 1) / 2,
+              input_offset_x = (filter_x - 1) / 2;
+
+    auto input_size = tensor(batch(batch_num), feature(input_f), spatial(input_x, input_y));
+    auto input_rnd = generate_random_4d<uint8_t>(batch_num, input_f, input_y, input_x, 0, 255);
+    auto input_rnd_vec = flatten_4d<uint8_t>(format::bfyx, input_rnd);
+    auto input = memory::allocate(engine, {data_types::u8, format::bfyx, input_size});
+    set_values(input, input_rnd_vec);
+
+    auto weights_size = tensor(group(groups), batch(output_f / groups), feature(input_f / groups), spatial(filter_x, filter_y));
+    VVVVVF<int8_t> weights_rnd = generate_random_5d<int8_t>(groups, output_f / groups, input_f / groups, filter_y, filter_x, -127, 127);
+    auto weights_lay = layout(data_types::i8, format::goiyx, weights_size);
+
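+    // Scatter the random 5D weights into a linear goiyx buffer via the layout's linear offset for each (g, o, i, y, x).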
+    std::vector<int8_t> weights_flat(weights_lay.get_linear_size());
+    for (int gi = 0; gi < groups; ++gi)
+        for (int ofi = 0; ofi < output_f / groups; ++ofi)
+            for (int ifi = 0; ifi < input_f / groups; ++ifi)
+                for (int kyi = 0; kyi < filter_y; ++kyi)
+                    for (int kxi = 0; kxi < filter_x; ++kxi) {
+                        tensor coords = tensor(group(gi), batch(ofi), feature(ifi), spatial(kxi, kyi, 0, 0));
+                        size_t offset = weights_lay.get_linear_offset(coords);
+                        weights_flat[offset] = weights_rnd[gi][ofi][ifi][kyi][kxi];
+                    }
+    auto weights = memory::allocate(engine, {data_types::i8, format::goiyx, weights_size});
+    set_values(weights, weights_flat);
+
+    VVVVF<float> expected_result(batch_num, VVVF<float>(output_f));
+
+    // Calculate reference values without bias
+    for (int bi = 0; bi < batch_num; ++bi)
+        for (int gi = 0; gi < groups; ++gi)
+            for (int ofi = 0; ofi < (int)weights_rnd[0].size(); ++ofi) {
+                bool grouped = groups > 1;
+                int f_begin = gi * input_f / groups;
+                int f_end = gi * input_f / groups + input_f / groups;
+
+                expected_result[bi][ofi + gi * output_f / groups] = reference_convolve<uint8_t, float, int8_t>(
+                    input_rnd[bi], weights_rnd[gi][ofi],  // input, weights
+                    stride, stride,                       // strides
+                    0,                                    // bias
+                    1, 1,                                 // dilation
+                    input_offset_y, input_offset_x,       // input padding
+                    0, 0,                                 // output_padding
+                    f_begin, f_end,                       // f_begin, f_end
+                    false,                                // depthwise
+                    grouped);                             // grouped
+            }
+
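+    // Topology: reorder the u8 input to b_fs_yx_fsv4, then run the grouped convolution on it;
+    // the imad kernel is forced through force_implementations below.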
+    topology topology(input_layout("input", input.get_layout()),
+                      data("weights", weights),
+                      reorder("input_fsv", "input", {data_types::u8, format::b_fs_yx_fsv4, input_size}),
+                      convolution("conv",
+                                  "input_fsv",
+                                  {"weights"},
+                                  groups,
+                                  {1, 1, stride, stride},
+                                  {0, 0, -input_offset_x, -input_offset_y},
+                                  {1, 1, 1, 1},
+                                  padding({0, 0, output_padding, output_padding}, 0.f)));
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    implementation_desc conv_impl = {format::b_fs_yx_fsv4, "fused_conv_eltwise_gpu_imad"};
+    options.set_option(build_option::force_implementations({{"conv", conv_impl}}));
+
+    network network(engine, topology, options);
+    network.set_input_data("input", input);
+    network.execute();
+
+    auto out_mem = network.get_output("conv").get_memory();
+    auto out_ptr = out_mem.pointer<float>();
+    auto out_lay = out_mem.get_layout();
+
+    ASSERT_EQ(out_mem.get_layout().format, format::b_fs_yx_fsv4);
+    ASSERT_EQ(out_lay.size.batch[0], expected_result.size());
+    ASSERT_EQ(out_lay.size.feature[0], expected_result[0].size());
+    ASSERT_EQ(out_lay.size.spatial[1], expected_result[0][0].size());
+    ASSERT_EQ(out_lay.size.spatial[0], expected_result[0][0][0].size());
+
+    for (int bi = 0; bi < batch_num; ++bi)
+        for (int ofi = 0; ofi < output_f; ++ofi)
+            for (int yi = 0; yi < (int)expected_result[0][0].size(); ++yi)
+                for (int xi = 0; xi < (int)expected_result[0][0][0].size(); ++xi) {
+                    tensor coords = tensor(batch(bi), feature(ofi), spatial(xi, yi, 0, 0));
+                    auto offset = out_lay.get_linear_offset(coords);
+                    auto val = out_ptr[offset];
+                    auto val_ref = expected_result[bi][ofi][yi][xi];
+                    auto equal = are_equal(val_ref, val, 1e-2f);
+                    if (!equal) {
+                        std::cout << "Value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val << std::endl;
+                        std::cout << "Reference value at batch: " << bi << ", output_f: " << ofi << ", y: " << yi << ", x: " << xi << " = " << val_ref << std::endl;
+                    }
+                    EXPECT_TRUE(equal);
+                }
+}
+
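+// Note: convolution_test_base below is no longer a gtest fixture; the value-parameterized TEST_P bodies
+// further down instantiate it (through the convolution_random_test_base aliases) and call run_random() directly.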
 template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_test_base : public testing::Test {
+class convolution_test_base {
 public:
     virtual topology build_topology(const cldnn::engine& engine) {
         auto input_lay = layout(input_type(), input_format(), input_size());
@@ -7377,12 +7768,30 @@ public:
 
         auto topo = topology();
         topo.add(input_layout("input", input_lay));
+        std::string input_id = "input";
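+        // Asymmetric quantization: when zero-points are provided, they are subtracted per channel from the input
+        // (and, below, from the weights) with an eltwise sub before the convolution.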
+        if (has_input_zp()) {
+            auto input_zp_lay = layout(input_type(), format::bfyx, tensor(feature(input_features())));
+            auto input_zp_mem = memory::allocate(engine, input_zp_lay);
+            set_values(input_zp_mem, _input_zp);
+            topo.add(data("input_zp", input_zp_mem));
+            topo.add(eltwise("input_asymm", { "input", "input_zp" }, eltwise_mode::sub));
+            input_id = "input_asymm";
+        }
         topo.add(data("weights", wei_mem));
+        std::string weights_id = "weights";
+        if (has_weights_zp()) {
+            auto weights_zp_lay = layout(weights_type(), format::bfyx, tensor(batch(output_features())));
+            auto weights_zp_mem = memory::allocate(engine, weights_zp_lay);
+            set_values(weights_zp_mem, _weights_zp);
+            topo.add(data("weights_zp", weights_zp_mem));
+            topo.add(eltwise("weights_asymm", { "weights", "weights_zp" }, eltwise_mode::sub));
+            weights_id = "weights_asymm";
+        }
         if (!has_bias()) {
             auto conv_prim = convolution(
                 "conv",
-                "input",
-                { "weights" },
+                input_id,
+                { weights_id },
                 static_cast<uint32_t>(groups()),
                 tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
                 tensor(batch(0), feature(0), spatial(_offset_x, _offset_y)),
@@ -7396,8 +7805,8 @@ public:
             topo.add(data("bias", bias_mem));
             auto conv_prim = convolution(
                 "conv",
-                "input",
-                { "weights" },
+                input_id,
+                { weights_id },
                 { "bias" },
                 static_cast<uint32_t>(groups()),
                 tensor(batch(0), feature(0), spatial(_stride_x, _stride_y)),
@@ -7420,7 +7829,8 @@ public:
         auto topo = build_topology(engine);
 
         auto build_opts = build_options(
-            build_option::optimize_data(true)
+            build_option::optimize_data(true),
+            build_option::force_implementations({ {"conv", {input_format(), ""}} })
         );
         auto prog = program(engine, topo, build_opts);
 
@@ -7445,6 +7855,17 @@ public:
         auto out_lay = out_mem.get_layout();
         auto out_ptr = out_mem.cldnn::memory::template pointer<OutputT>();
 
+        std::stringstream description;
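+        // Collect the kernel chosen for "conv" and the list of executed primitives; appended to assertion messages below.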
+        for (auto i : net.get_primitives_info()) {
+            if (i.original_id == "conv") {
+                description << "  kernel: " << i.kernel_id << std::endl;
+            }
+        }
+        description << "  executed: ";
+        for (auto e : net.get_executed_primitive_ids()) {
+            description << e << ", ";
+        }
+
         ASSERT_EQ(out_lay.data_type, output_type());
         ASSERT_EQ(out_lay.size.batch[0], expected.size());
         ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
@@ -7458,8 +7879,9 @@ public:
                         tensor coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
                         size_t offset = out_lay.get_linear_offset(coords);
 
-                        EXPECT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
-                            << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+                        ASSERT_EQ(out_ptr[offset], expected[bi][fi][yi][xi])
+                            << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi << std::endl
+                            << description.str();
                     }
     }
 
@@ -7491,10 +7913,20 @@ public:
         _dilation_y = dilation_y;
     }
 
+    void set_input_zp(VF<InputT> input_zp) {
+        _input_zp = std::move(input_zp);
+    }
+
+    void set_weights_zp(VF<WeightsT> weights_zp) {
+        _weights_zp = std::move(weights_zp);
+    }
+
 protected:
     VVVVF<InputT> _input;
     VVVVF<WeightsT> _weights;
     VF<OutputT> _bias;
+    VF<InputT> _input_zp;
+    VF<WeightsT> _weights_zp;
     format::type _input_fmt;
     int _stride_x, _stride_y;
     int _offset_x, _offset_y;
@@ -7511,6 +7943,8 @@ protected:
     size_t groups() const { return input_features() / weights_input_features(); }
 
     bool has_bias() { return _bias.size() > 0; }
+    bool has_input_zp() { return _input_zp.size() > 0; }
+    bool has_weights_zp() { return _weights_zp.size() > 0; }
 
     data_types input_type() const { return type_to_data_type<InputT>::value; }
     format input_format() const { return _input_fmt; }
@@ -7545,78 +7979,23 @@ struct convolution_random_test_all_params {
     bool with_bias;
     size_t groups;
     format::type input_format;
+    bool asymmetric_weights;
+    bool asymmetric_data;
 };
 
-using convolution_random_test_params = std::tuple<
-    size_t,                     // batch
-    size_t,                     // input features
-    size_t,                     // output features
-    std::tuple<size_t, size_t>, // input x, y
-    std::tuple<size_t, size_t>, // filter x, y
-    std::tuple<int, int>,       // stride x, y
-    std::tuple<int, int>,       // offset x, y
-    std::tuple<int, int>,       // dilation x, y
-    bool,                       // with bias
-    format::type                // input format
->;
-
-static convolution_random_test_all_params convert_random_test_params(const convolution_random_test_params& params) {
-    convolution_random_test_all_params all_params;
-    std::forward_as_tuple(
-        all_params.batch,
-        all_params.input_features,
-        all_params.output_features,
-        std::forward_as_tuple(all_params.input_xy[0], all_params.input_xy[1]),
-        std::forward_as_tuple(all_params.filter_xy[0], all_params.filter_xy[1]),
-        std::forward_as_tuple(all_params.stride_xy[0], all_params.stride_xy[1]),
-        std::forward_as_tuple(all_params.offset_xy[0], all_params.offset_xy[1]),
-        std::forward_as_tuple(all_params.dilation_xy[0], all_params.dilation_xy[1]),
-        all_params.with_bias,
-        all_params.input_format) = params;
-    all_params.groups = 1;
-    return all_params;
-}
-
-using convolution_random_test_depthwise_params = std::tuple<
-    size_t,                     // batch
-    size_t,                     // input/output features
-    std::tuple<size_t, size_t>, // input x, y
-    std::tuple<size_t, size_t>, // filter x, y
-    std::tuple<int, int>,       // stride x, y
-    std::tuple<int, int>,       // offset x, y
-    std::tuple<int, int>,       // dilation x, y
-    bool,                       // with bias
-    format::type                // input format
->;
-
-static convolution_random_test_all_params convert_random_test_params(const convolution_random_test_depthwise_params& params) {
-    convolution_random_test_all_params all_params;
-    std::forward_as_tuple(
-        all_params.batch,
-        all_params.input_features,
-        std::forward_as_tuple(all_params.input_xy[0], all_params.input_xy[1]),
-        std::forward_as_tuple(all_params.filter_xy[0], all_params.filter_xy[1]),
-        std::forward_as_tuple(all_params.stride_xy[0], all_params.stride_xy[1]),
-        std::forward_as_tuple(all_params.offset_xy[0], all_params.offset_xy[1]),
-        std::forward_as_tuple(all_params.dilation_xy[0], all_params.dilation_xy[1]),
-        all_params.with_bias,
-        all_params.input_format) = params;
-    all_params.groups = all_params.input_features;
-    all_params.output_features = all_params.input_features;
-    return all_params;
-}
-
 template <typename InputT, typename WeightsT, typename OutputT>
 class convolution_random_test_base : public convolution_test_base<InputT, WeightsT, OutputT> {
 public:
     virtual VVVVF<OutputT> calculate_reference() {
         VVVVF<OutputT> expected = VVVVF<OutputT>(this->batch_num(), VVVF<OutputT>(this->output_features()));
         bool depthwise = this->groups() == this->input_features();
+        bool grouped = this->groups() > 1 && !depthwise;
         for (size_t bi = 0; bi < this->batch_num(); ++bi)
         for (size_t fi = 0; fi < this->output_features(); ++fi) {
             size_t f_begin = depthwise ? fi : 0;
             size_t f_end = (depthwise ? fi : 0) + this->weights_input_features();
             auto bias = this->has_bias() ? this->_bias[fi] : static_cast<OutputT>(0);
+            auto weights_zp = this->has_weights_zp() ? this->_weights_zp[fi] : static_cast<WeightsT>(0);
             expected[bi][fi] = reference_convolve<InputT, OutputT, WeightsT>(
                 this->_input[bi],
                 this->_weights[fi],
@@ -7631,7 +8010,10 @@ public:
                 0,
                 f_begin,
                 f_end,
-                depthwise);
+                depthwise,
+                grouped,
+                this->_input_zp,
+                weights_zp);
         }
         return expected;
     }
@@ -7644,6 +8026,8 @@ public:
         auto weights_data = generate_random_4d<WeightsT>(
             params.output_features, wei_in_f, params.filter_xy[1], params.filter_xy[0], -256, 256);
         auto bias_data = params.with_bias ? generate_random_1d<OutputT>(params.output_features, -256, 256) : VF<OutputT>();
+        auto weights_zp_data = params.asymmetric_weights ? generate_random_1d<WeightsT>(params.output_features, -256, 256) : VF<WeightsT>();
+        auto input_zp_data = params.asymmetric_data ? generate_random_1d<InputT>(params.input_features, -256, 256) : VF<InputT>();
 
         this->set_input(params.input_format, std::move(input_data));
         this->set_weights(std::move(weights_data));
@@ -7651,6 +8035,8 @@ public:
         this->set_strides(params.stride_xy[0], params.stride_xy[1]);
         this->set_offsets(params.offset_xy[0], params.offset_xy[1]);
         this->set_dilation(params.dilation_xy[0], params.dilation_xy[1]);
+        this->set_weights_zp(std::move(weights_zp_data));
+        this->set_input_zp(std::move(input_zp_data));
     }
 
     void run_random(const convolution_random_test_all_params& params) {
@@ -7663,7 +8049,8 @@ public:
 
 // construct a readable name in format as follows:
 // <out format>_i<input>_w<weights>_s<stride>_ofs<offset>_d<dilation>_g<groups>[_bias][_wzp][_izp]
-static std::string to_string_convolution_all_params(const convolution_random_test_all_params& params) {
+static std::string to_string_convolution_all_params(const testing::TestParamInfo<convolution_random_test_all_params>& param_info) {
+    auto& params = param_info.param;
     int Batch = (int)params.batch;
     int iF = (int)params.input_features;
     int oF = (int)params.output_features;
@@ -7675,6 +8062,8 @@ static std::string to_string_convolution_all_params(const convolution_random_tes
     auto groups = params.groups;
     bool Bias = params.with_bias;
     format::type iType = params.input_format;  // input format
+    bool asymm_weights = params.asymmetric_weights;
+    bool asymm_input = params.asymmetric_data;
     // Wrapper for negative values, since e.g. "-1" would generate an invalid gtest param string
     auto to_string_neg = [](int val) {
         if (val >= 0)
@@ -7690,142 +8079,13 @@ static std::string to_string_convolution_all_params(const convolution_random_tes
         "_ofs" + to_string_neg(Offset[0]) + 'x' + to_string_neg(Offset[1]) +
         "_d" + std::to_string(Dilation[0]) + 'x' + std::to_string(Dilation[1]) +
         "_g" + std::to_string(groups) +
-        (Bias ? "_bias" : "");
-}
-
-template <typename T>
-std::string to_string_convolution_random_params(testing::TestParamInfo<T> param_info) {
-    return to_string_convolution_all_params(convert_random_test_params(param_info.param));
+        (Bias ? "_bias" : "") + (asymm_weights ? "_wzp" : "") + (asymm_input ? "_izp" : "");
 }
 
 template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT>
-                              , public testing::WithParamInterface<convolution_random_test_params> {};
-
-
-using convolution_random_test_s8s8f32 = convolution_random_test<int8_t, int8_t, float>;
-using convolution_random_test_u8s8f32 = convolution_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_random_test_s8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_random_test_u8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_random_test_s8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_random_test_u8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4_1x1_lwg_opt,
-    convolution_random_test_s8s8f32,
-    testing::Combine(
-        testing::Values(1),                               // batch
-        testing::Values(128, 256, 512),                   // input features
-        testing::Values(64),                              // output features
-        testing::Values(std::pair<size_t, size_t>(3, 3)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1)),       // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),       // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),       // dilation x, y
-        testing::Values(false),                           // bias
-        testing::Values(format::b_fs_yx_fsv4)             // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_random_dw_test : public convolution_random_test_base<InputT, WeightsT, OutputT>
-                                 , public testing::WithParamInterface<convolution_random_test_depthwise_params> {};
-
-using convolution_random_test_dw_s8s8f32 = convolution_random_dw_test<int8_t, int8_t, float>;
-using convolution_random_test_dw_u8s8f32 = convolution_random_dw_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_random_test_dw_s8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_random_test_dw_u8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_random_test_dw_s8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input/output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0), std::pair<int, int>(-1, -1)),           // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_random_test_dw_u8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input/output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0), std::pair<int, int>(-1, -1)),           // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-INSTANTIATE_TEST_CASE_P(
-    special_cases,
-    convolution_random_test_dw_u8s8f32,
-    testing::Values(
-        convolution_random_test_depthwise_params(
-            1, 32, std::pair<size_t, size_t>(28, 28), std::pair<size_t, size_t>(3, 3),
-            std::pair<int, int>(1, 1), std::pair<int, int>(-1, -1), std::pair<int, int>(1, 1), true, format::b_fs_yx_fsv4)
-    ),
-    to_string_convolution_random_params<convolution_random_test_depthwise_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_scale_random_test : public convolution_random_test<InputT, WeightsT, OutputT> {
+class convolution_scale_random_test : public convolution_random_test_base<InputT, WeightsT, OutputT> {
 public:
-    using parent = convolution_random_test<InputT, WeightsT, OutputT>;
+    using parent = convolution_random_test_base<InputT, WeightsT, OutputT>;
 
     virtual primitive_id output_primitive_id() const {
         return "scale_wa_reorder";
@@ -7855,9 +8115,9 @@ public:
         auto expected = parent::calculate_reference();
 
         for (size_t bi = 0; bi < this->batch_num(); ++bi)
-        for (size_t fi = 0; fi < this->output_features(); ++fi) {
-            expected[bi][fi] = reference_scale_post_op<OutputT>(expected[bi][fi], _scale[fi], _shift[fi]);
-        }
+            for (size_t fi = 0; fi < this->output_features(); ++fi) {
+                expected[bi][fi] = reference_scale_post_op<OutputT>(expected[bi][fi], _scale[fi], _shift[fi]);
+            }
         return expected;
     }
 
@@ -7872,197 +8132,153 @@ protected:
     VF<OutputT> _shift;
 };
 
-using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
-using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_scale_random_test_s8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-TEST_P(convolution_scale_random_test_u8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
-}
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_scale_random_test_s8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_params>);
-
-INSTANTIATE_TEST_CASE_P(
-    b_fs_yx_fsv4,
-    convolution_scale_random_test_u8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::b_fs_yx_fsv4)                                              // input format
-    ),
-    to_string_convolution_random_params<convolution_random_test_params>);
-
-template <typename InputT, typename WeightsT, typename OutputT>
-class convolution_asymm_weights_data_random_test : public convolution_random_test<InputT, WeightsT, OutputT> {
-    using parent = convolution_random_test<InputT, WeightsT, OutputT>;
-
-    virtual primitive_id output_primitive_id() const {
-       return "conv_wa_reorder";
-    }
+class convolution_random_smoke_test : public testing::TestWithParam<convolution_random_test_all_params> {};
 
-    topology build_topology(const cldnn::engine& engine) override {
-        auto input_lay = layout(this->input_type(), this->input_format(), this->input_size());
-        auto wei_lay = layout(this->weights_type(), format::bfyx, this->weights_size());
-        auto data_zp_lay = layout(this->input_type(), format::bfyx, tensor(batch(1), feature(this->input_features()), spatial(1, 1)));
-        auto wei_zp_lay = layout(this->weights_type(), format::bfyx, tensor(batch(this->output_features()), feature(1), spatial(1, 1)));
+using convolution_random_test_s8s8f32 = convolution_random_test_base<int8_t, int8_t, float>;
+using convolution_random_test_u8s8f32 = convolution_random_test_base<uint8_t, int8_t, float>;
 
-        auto wei_mem = memory::allocate(engine, wei_lay);
-        auto data_zp_mem = memory::allocate(engine, data_zp_lay);
-        auto wei_zp_mem = memory::allocate(engine, wei_zp_lay);
-        auto weights_flat = flatten_4d(format::bfyx, this->_weights);
-        set_values(wei_mem, weights_flat);
-        set_values(data_zp_mem, _data_zp);
-        set_values(wei_zp_mem, _weights_zp);
+using convolution_scale_random_test_s8s8f32 = convolution_scale_random_test<int8_t, int8_t, float>;
+using convolution_scale_random_test_u8s8f32 = convolution_scale_random_test<uint8_t, int8_t, float>;
 
-        auto topo = topology();
-        topo.add(input_layout("input", input_lay));
-        topo.add(data("weights", wei_mem));
-        topo.add(data("data_zp", data_zp_mem));
-        topo.add(data("weights_zp", wei_zp_mem));
-        auto input_asymm_prim = eltwise("input_asymm", "input", "data_zp", eltwise_mode::sub);
-        auto weights_asymm_prim = eltwise("weights_asymm", "weights", "weights_zp", eltwise_mode::sub);
-        input_asymm_prim.output_data_type = data_types::f32;
-        weights_asymm_prim.output_data_type = data_types::f32;
-        topo.add(input_asymm_prim);
-        topo.add(weights_asymm_prim);
-        if (!this->has_bias()) {
-            auto conv_prim = convolution(
-                "conv",
-                "input_asymm",
-                { "weights_asymm" },
-                tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
-                tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
-                tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
-            conv_prim.output_data_type = this->output_type();
-            topo.add(conv_prim);
-        } else {
-            auto bias_lay = layout(this->output_type(), format::bfyx, tensor(feature(this->output_features())));
-            auto bias_mem = memory::allocate(engine, bias_lay);
-            set_values(bias_mem, this->_bias);
-            topo.add(data("bias", bias_mem));
-            auto conv_prim = convolution(
-                "conv",
-                "input_asymm",
-                { "weights_asymm" },
-                { "bias" },
-                tensor(batch(0), feature(0), spatial(this->_stride_x, this->_stride_y)),
-                tensor(batch(0), feature(0), spatial(this->_offset_x, this->_offset_y)),
-                tensor(batch(0), feature(0), spatial(this->_dilation_x, this->_dilation_y)));
-            conv_prim.output_data_type = this->output_type();
-            topo.add(conv_prim);
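+// Helper that accumulates convolution_random_test_all_params; smoke_test_params()/extra_test_params() append
+// typical 1x1/3x3/5x5/7x7, depthwise and dilated cases for a given input format and zero-point (asymmetric) flags.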
+struct params_generator : std::vector<convolution_random_test_all_params> {
+    params_generator& smoke_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+        std::vector<size_t> batches = { 1, 2 };
+        for (auto b : batches) {
+            // 7x7
+            push_back(convolution_random_test_all_params{
+                b, 3, 32, { 28, 28 }, { 7, 7 }, { 2, 2 }, { -3, -3 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // 3x3
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 14, 14 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // 1x1
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // 5x5
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 32, 48, { 28, 28 }, { 5, 5 }, { 2, 2 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // depthwise
+            push_back(convolution_random_test_all_params{
+                b, 64, 64, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 64, 64, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 1, 1 }, true, 64, input_format, asymm_weights, asymm_data });
+            // dilation
+            push_back(convolution_random_test_all_params{
+                b, 32, 24, { 19, 19 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 32, 24, { 19, 19 }, { 3, 3 }, { 2, 2 }, { -1, -1 }, { 2, 2 }, true, 1, input_format, asymm_weights, asymm_data });
         }
-        topo.add(reorder("conv_wa_reorder", "conv", format::bfyx, this->output_type()));
-
-        return topo;
+        return *this;
+    }
+
+    params_generator& extra_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+        std::vector<size_t> batches = { 1, 2 };
+        for (auto b : batches) {
+            // 1x1
+            push_back(convolution_random_test_all_params{
+                b, 23, 41, { 19, 19 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 23, 41, { 19, 19 }, { 1, 1 }, { 2, 2 }, { 0, 0 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // 3x3
+            push_back(convolution_random_test_all_params{
+                b, 16, 28, { 14, 14 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 23, 41, { 19, 17 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            // 5x5
+            push_back(convolution_random_test_all_params{
+                b, 16, 28, { 14, 14 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+            push_back(convolution_random_test_all_params{
+                b, 23, 41, { 19, 17 }, { 5, 5 }, { 1, 1 }, { -2, -2 }, { 1, 1 }, true, 1, input_format, asymm_weights, asymm_data });
+        }
+        return *this;
     }
 
-    VVVVF<OutputT> calculate_reference() override {
-        VVVVF<OutputT> expected = VVVVF<OutputT>(this->batch_num(), VVVF<OutputT>(this->output_features()));
-        for (size_t bi = 0; bi < this->batch_num(); ++bi)
-            for (size_t fi = 0; fi < this->output_features(); ++fi) {
-                auto bias = this->has_bias() ? this->_bias[fi] : static_cast<OutputT>(0);
-                expected[bi][fi] = reference_convolve<InputT, OutputT, WeightsT>(
-                    this->_input[bi],
-                    this->_weights[fi],
-                    this->_stride_y,
-                    this->_stride_x,
-                    static_cast<float>(bias),
-                    this->_dilation_y,
-                    this->_dilation_x,
-                    this->_offset_y,
-                    this->_offset_x,
-                    0,
-                    0,
-                    0,
-                    0,
-                    false,
-                    _data_zp,
-                    _weights_zp[fi]);
-            }
-        return expected;
+    params_generator& all_test_params(format::type input_format, bool asymm_weights = false, bool asymm_data = false) {
+        return smoke_test_params(input_format, asymm_weights, asymm_data)
+            .extra_test_params(input_format, asymm_weights, asymm_data);
     }
 
-    void param_set_up(const convolution_random_test_all_params& params) override {
-        parent::param_set_up(params);
-
-        _data_zp = generate_random_1d<InputT>(this->input_features(), -128, 128);
-        _weights_zp = generate_random_1d<WeightsT>(this->output_features(), -128, 128);
+    params_generator& add(convolution_random_test_all_params params) {
+        push_back(params);
+        return *this;
     }
-
-protected:
-    VF<InputT> _data_zp;
-    VF<WeightsT> _weights_zp;
 };
 
-using convolution_asymm_random_test_s8s8f32 = convolution_asymm_weights_data_random_test<int8_t, int8_t, float>;
-using convolution_asymm_random_test_u8s8f32 = convolution_asymm_weights_data_random_test<uint8_t, int8_t, float>;
-
-TEST_P(convolution_asymm_random_test_s8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
+TEST_P(convolution_random_smoke_test, u8s8f32) {
+    convolution_random_test_u8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
 }
 
-TEST_P(convolution_asymm_random_test_u8s8f32, random) {
-    ASSERT_NO_FATAL_FAILURE(run_random(convert_random_test_params(GetParam())));
+TEST_P(convolution_random_smoke_test, u8s8f32_scale) {
+    convolution_scale_random_test_u8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
 }
 
 INSTANTIATE_TEST_CASE_P(
-    basic_asymm,
-    convolution_asymm_random_test_s8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::bfyx, format::b_fs_yx_fsv32)                               // input format
+    basic,
+    convolution_random_smoke_test,
+    testing::ValuesIn(
+        params_generator()
+        .smoke_test_params(format::b_fs_yx_fsv4)
+        .smoke_test_params(format::bfyx)
+        .smoke_test_params(format::b_fs_yx_fsv32)
+        .smoke_test_params(format::b_fs_yx_fsv32, true, true)
+        .smoke_test_params(format::b_fs_yx_fsv32, false, true)
+        .smoke_test_params(format::b_fs_yx_fsv32, true, false)
+        .smoke_test_params(format::b_fs_yx_fsv16)
     ),
-    to_string_convolution_random_params<convolution_random_test_params>);
+    to_string_convolution_all_params
+);
+
+class convolution_random_all_test : public testing::TestWithParam<convolution_random_test_all_params> {};
+
+TEST_P(convolution_random_all_test, u8s8f32) {
+    convolution_random_test_u8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, s8s8f32) {
+    convolution_random_test_s8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, u8s8f32_scale) {
+    convolution_scale_random_test_u8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
+
+TEST_P(convolution_random_all_test, s8s8f32_scale) {
+    convolution_scale_random_test_s8s8f32 test;
+    ASSERT_NO_FATAL_FAILURE(test.run_random(GetParam()));
+}
 
 INSTANTIATE_TEST_CASE_P(
-    basic_asymm,
-    convolution_asymm_random_test_u8s8f32,
-    testing::Combine(
-        testing::Values(1, 2),                                                             // batch
-        testing::Values(3, 32),                                                            // input features
-        testing::Values(16, 32),                                                           // output features
-        testing::Values(std::pair<size_t, size_t>(7, 7), std::pair<size_t, size_t>(8, 8)), // input x, y
-        testing::Values(std::pair<size_t, size_t>(1, 1), std::pair<size_t, size_t>(3, 3)), // filter x, y
-        testing::Values(std::pair<int, int>(1, 1), std::pair<int, int>(2, 2)),             // strides x, y
-        testing::Values(std::pair<int, int>(0, 0)),                                        // offsets x, y
-        testing::Values(std::pair<int, int>(1, 1)),                                        // dilation x, y
-        testing::Values(false, true),                                                      // bias
-        testing::Values(format::bfyx, format::b_fs_yx_fsv32)                               // input format
+    DISABLED_basic,
+    convolution_random_all_test,
+    testing::ValuesIn(
+        params_generator()
+        .all_test_params(format::bfyx)
+        .all_test_params(format::bfyx, true, true)
+        .all_test_params(format::bfyx, false, true)
+        .all_test_params(format::bfyx, true, false)
+        .all_test_params(format::b_fs_yx_fsv4)
+        // byxf_af32 - depthwise broken for batch > 1
+        // .smoke_test_params(format::byxf_af32)
+        .all_test_params(format::b_fs_yx_fsv32)
+        .all_test_params(format::b_fs_yx_fsv32, true, true)
+        .all_test_params(format::b_fs_yx_fsv32, false, true)
+        .all_test_params(format::b_fs_yx_fsv32, true, false)
+        .all_test_params(format::b_fs_yx_fsv16)
+        .add(convolution_random_test_all_params{
+            1, 89, 3, { 1, 1 }, { 3, 3 }, { 1, 1 }, { -1, -1 }, { 1, 1 }, true, 1, format::b_fs_yx_fsv4, false, false })
     ),
-    to_string_convolution_random_params<convolution_random_test_params>);
+    to_string_convolution_all_params
+);
 
 class convolution_test : public tests::generic_test
 {
index 522559e..3a1210e 100644 (file)
@@ -20,6 +20,7 @@
 #include "api/memory.hpp"
 #include <api/input_layout.hpp>
 #include "api/deconvolution.hpp"
+#include "api/crop.hpp"
 #include <api/data.hpp>
 #include <api/topology.hpp>
 #include <api/network.hpp>
@@ -1548,6 +1549,89 @@ TEST(deconvolution_f32_fw_gpu, basic3D_wsiz2x2x2_in1x1x2x2x2_stride2_pad1) {
 
 }
 
+TEST(deconvolution_f16_gpu, basic_k9x9_s2x2_pad4x4) {
+    //  Filter : 1x32x9x9
+    //  Input  : 1x32x16x16
+    //  Stride : 2x2
+    //  Pad    : 4x4
+
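+    // Compares a plain f16-weights deconvolution ("deconv") against a network built from f32 weights
+    // with optimize_data enabled, whose output is reordered back to f16 ("out").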
+    // A dedicated engine is used here rather than the shared get_test_engine().
+    engine engine;
+
+    VVVVF<FLOAT16> input_rnd = generate_random_4d<FLOAT16>(1, 32, 16, 16, -2, 2);
+    VF<FLOAT16> input_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, input_rnd);
+    VVVVF<FLOAT16> filter_rnd = generate_random_4d<FLOAT16>(1, 32, 9, 9, -1, 1);
+    VF<FLOAT16> filter_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, filter_rnd);
+    VF<FLOAT16> bias_rnd = generate_random_1d<FLOAT16>(1, -1, 1);
+    VF<float> filter_rnd_f32_vec, bias_f32_rnd;
+
+    for (unsigned int i = 0; i < filter_rnd_vec.size(); i++)
+        filter_rnd_f32_vec.push_back(float(filter_rnd_vec[i]));
+
+    for (unsigned int i = 0; i < bias_rnd.size(); i++)
+        bias_f32_rnd.push_back(float(bias_rnd[i]));
+
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 32, 16, 16 } });
+    auto weights = memory::allocate(engine, { data_types::f16, format::oiyx, { 1, 32, 9, 9 } });
+    auto biases = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 1, 1, 1 } });
+    auto weights_f32 = memory::allocate(engine, { data_types::f32, format::oiyx, { 1, 32, 9, 9 } });
+    auto biases_f32 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
+
+    set_values(input, input_rnd_vec);
+    set_values(weights, filter_rnd_vec);
+    set_values(biases, bias_rnd);
+    set_values(weights_f32, filter_rnd_f32_vec);
+    set_values(biases_f32, bias_f32_rnd);
+
+    topology topology_ref(
+        input_layout("input", input.get_layout()),
+        data("weights", weights),
+        data("biases", biases),
+        deconvolution("deconv", "input", { "weights" }, { "biases" }, { 1, 1, 2, 2 }, { 0, 0, -4, -4 }, tensor{ 1, 1, 32, 32 })
+    );
+
+    network network_ref(engine, topology_ref);
+    network_ref.set_input_data("input", input);
+
+    auto outputs_ref = network_ref.execute();
+    EXPECT_EQ(outputs_ref.size(), size_t(1));
+    EXPECT_EQ(outputs_ref.begin()->first, "deconv");
+    auto output_ref_prim = outputs_ref.begin()->second.get_memory();
+    auto output_ref_ptr = output_ref_prim.pointer<FLOAT16>();
+
+    std::vector<FLOAT16> output_vec_ref;
+    for (unsigned int i = 0; i < output_ref_prim.get_layout().count(); i++)
+    {
+        output_vec_ref.push_back(output_ref_ptr[i]);
+    }
+
+    topology topology_act(
+        input_layout("input_act", input.get_layout()),
+        data("weights_f32", weights_f32),
+        data("biases_f32", biases_f32),
+        deconvolution("deconv_act", "input_act", { "weights_f32" }, { "biases_f32" }, { 1, 1, 2, 2 }, { 0, 0, -4, -4 }),
+        reorder("out", "deconv_act", format::bfyx, data_types::f16)
+    );
+
+    cldnn::build_options options;
+    options.set_option(cldnn::build_option::optimize_data(true));
+    network network_act(engine, topology_act, options);
+    network_act.set_input_data("input_act", input);
+
+    auto outputs_act = network_act.execute();
+    EXPECT_EQ(outputs_act.size(), size_t(1));
+    EXPECT_EQ(outputs_act.begin()->first, "out");
+    auto output_act_prim = outputs_act.begin()->second.get_memory();
+    auto output_act_ptr = output_act_prim.pointer<FLOAT16>();
+
+    std::vector<float> output_vec;
+    for (unsigned int i = 0; i < output_act_prim.get_layout().count(); i++)
+    {
+        float x = float_round(output_act_ptr[i]), y = float_round(output_vec_ref[i]);
+        EXPECT_NEAR(x, y, 1e-0f);
+    }
+}
+
 TEST(deconvolution_f32_fw_gpu, basic_wsiz2x2_in2x2x1x2_b_fs_yx_fsv16_stride2_pad1) {
     //  Filter : 2x2
     //  Input  : 2x2x1x2
index cf435ea..4cc7997 100644 (file)
 #include <api/memory.hpp>
 #include <api/depth_to_space.hpp>
 #include <api/topology.hpp>
+#include <api/reshape.hpp>
 #include <api/network.hpp>
+#include "api/permute.hpp"
+#include "api/reorder.hpp"
 
 #include <cstddef>
 #include <tests/test_utils/test_utils.h>
@@ -211,42 +214,65 @@ TEST(depth_to_space_fp32_gpu, d1411_bs2) {
     }
 }
 
-TEST(depth_to_space_fp32_gpu, d1421_bs2) {
-    //  Input  : 1x4x2x1
+TEST(depth_to_space_fp32_gpu, d112960540_bs2) {
+    //  Input  : 1x12x960x540
     //  Block size : 2
-    //  Output : 1x1x4x2
-    //  Input values in fp32
+    //  Output : 1x3x1920x1080
+    //  Input values in fp16
 
     engine engine;
 
-    auto input1 = memory::allocate(engine, { data_types::f32, format::bfyx, { 1, 4, 1, 2 } });
+    auto input1 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 12, 960, 540 } });
     size_t block_size = 2;
 
-    set_values(input1, {
-        0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
-    });
+    auto random_input = generate_random_4d<FLOAT16>(1, 12, 540, 960, -1, 1);
+    auto input_rnd_vec = flatten_4d<FLOAT16>(format::bfyx, random_input);
+    set_values(input1, input_rnd_vec);
 
-    topology topology;
-    topology.add(input_layout("Input0", input1.get_layout()));
-    topology.add(
+    topology topology_act;
+    topology_act.add(input_layout("Input0", input1.get_layout()));
+    topology_act.add(
         depth_to_space("depth_to_space", "Input0", block_size)
     );
 
-    network network(engine, topology);
+    network network_act(engine, topology_act);
 
-    network.set_input_data("Input0", input1);
+    network_act.set_input_data("Input0", input1);
 
-    auto outputs = network.execute();
+    auto outputs = network_act.execute();
 
     auto output = outputs.at("depth_to_space").get_memory();
-    auto output_ptr = output.pointer<float>();
+    auto output_ptr = output.pointer<FLOAT16>();
 
-    std::vector<float> expected_results = {
-        0.f, 2.f, 4.f, 6.f, 1.f, 3.f, 5.f, 7.f
-    };
+    std::vector<uint16_t> perm = { 0,4,5,2,1,3 };
 
-    for (size_t i = 0; i < expected_results.size(); ++i) {
-        EXPECT_EQ(expected_results[i], output_ptr[i]);
+    topology topology_ref;
+    topology_ref.add(input_layout("Input0", input1.get_layout()));
+    topology_ref.add(reorder("reorder1", "Input0", { data_types::f16, format::bfwzyx, tensor{ batch(1), feature(12), spatial(1, 1, 960, 540) }
+        }));
+    topology_ref.add(
+        reshape("reshape", "reorder1", tensor{ batch(1), feature(2), spatial(960, 540, 3, 2) })
+    );
+    topology_ref.add(
+        permute("perm", "reshape", perm)
+    );
+    topology_ref.add(
+        reshape("reshape2", "perm", tensor(1, 3, 2 * 960, 2 * 540))
+    );
+
+    build_options build_opt;
+
+    build_opt.set_option(build_option::optimize_data(true));
+    network network_ref(engine, topology_ref, build_opt);
+    network_ref.set_input_data("Input0", input1);
+
+    auto outputs_ref = network_ref.execute();
+
+    auto output_ref = outputs_ref.at("reshape2").get_memory();
+    auto output_ptr_ref = output_ref.pointer<FLOAT16>();
+
+    for (size_t i = 0; i < output.get_layout().count(); ++i) {
+        EXPECT_EQ(output_ptr_ref[i], output_ptr[i]);
     }
 }
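The reference topology above reproduces depth_to_space through a reorder/reshape/permute/reshape chain. As a rough stand-alone illustration of that equivalence (not tied to the exact cldnn permute order used in the test; CRD-style channel ordering on a plain row-major bfyx buffer, all names hypothetical):

// Minimal sketch: output[c][y][x] = input[c*b*b + (y%b)*b + (x%b)][y/b][x/b]
#include <cstddef>
#include <vector>

std::vector<float> depth_to_space_ref(const std::vector<float>& in,
                                      std::size_t C, std::size_t H, std::size_t W, std::size_t b) {
    // in  : 1 x (C*b*b) x H x W, row-major
    // out : 1 x C x (H*b) x (W*b)
    std::vector<float> out(C * H * b * W * b);
    for (std::size_t c = 0; c < C; ++c)
        for (std::size_t y = 0; y < H * b; ++y)
            for (std::size_t x = 0; x < W * b; ++x) {
                std::size_t in_c = c * b * b + (y % b) * b + (x % b);
                out[(c * H * b + y) * W * b + x] = in[(in_c * H + y / b) * W + x / b];
            }
    return out;
}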
 
index 927bf53..9b750a1 100644 (file)
@@ -3316,6 +3316,7 @@ static std::vector<data_types> types = {data_types::f32, data_types::f16};
 static std::vector<std::vector<tensor>> inputs = {
         {{1, 2, 3, 4}, {1, 2, 3, 4}},
         {{1, 16, 8, 2}, {1, 16, 8, 2}},
+        {{1, 128, 16, 8}, {1, 1, 16, 8}},
         {{1, 32, 2, 2}, {1, 32, 2, 2}},
         {{8, 32, 4, 5}, {8, 32, 4, 5}},
         {{1, 2, 3, 4}, {1, 2, 1, 1}},
index 0171607..d6ff3ab 100644 (file)
@@ -26,6 +26,7 @@
 #include <api/engine.hpp>
 #include "test_utils/test_utils.h"
 #include <api/data.hpp>
+#include <api/depth_to_space.hpp>
 
 #include <api_extension/fused_conv_eltwise.hpp>
 
@@ -76,6 +77,77 @@ TEST(fused_conv_eltwise, basic_0)
     EXPECT_EQ(out_layout.size.spatial[1], 5);
 }
 
+TEST(fused_conv_eltwise, basic_image2d)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 4, 128, 2 } });
+    auto input2 = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 3, 256, 4 } });
+    auto weights = memory::allocate(engine, { data_types::f16, format::bfyx, { 12, 4, 1, 1 } });
+
+    auto input_data1 = generate_random_4d<FLOAT16>(1, 4, 2, 128, -1, 1);
+    auto input_data1_bfyx = flatten_4d(format::bfyx, input_data1);
+    set_values(input, input_data1_bfyx);
+
+    auto input_data2 = generate_random_4d<FLOAT16>(1, 3, 4, 256, -1, 1);
+    auto input_data2_bfyx = flatten_4d(format::bfyx, input_data2);
+    set_values(input2, input_data2_bfyx);
+
+    auto weights_data = generate_random_4d<FLOAT16>(12, 4, 1, 1, -1, 1);
+    auto weights_data_bfyx = flatten_4d(format::bfyx, weights_data);
+    set_values(weights, weights_data_bfyx);
+
+    topology topology_act(
+        input_layout("input", input.get_layout()),
+        input_layout("input2", input2.get_layout()),
+        data("weights", weights),
+        convolution("conv", "input", { "weights" }),
+        depth_to_space("depth_to_space", "conv", 2),
+        eltwise("eltwise", "input2", "depth_to_space", eltwise_mode::sum)
+    );
+
+    build_options opt_act;
+    opt_act.set_option(build_option::optimize_data(true));
+    network network_act(engine, topology_act, opt_act);
+    network_act.set_input_data("input", input);
+    network_act.set_input_data("input2", input2);
+
+    auto outputs_act = network_act.execute();
+    EXPECT_EQ(outputs_act.size(), size_t(1));
+    EXPECT_EQ(outputs_act.begin()->first, "eltwise");
+
+    auto output_act = outputs_act.begin()->second.get_memory();
+    auto&& out_act_layout = output_act.get_layout();
+    auto out_act_ptr = output_act.pointer<uint8_t>();
+
+    topology topology_ref(
+        input_layout("input", input.get_layout()),
+        input_layout("input2", input2.get_layout()),
+        data("weights", weights),
+        convolution("conv", "input", { "weights" }),
+        depth_to_space("depth_to_space", "conv", 2),
+        eltwise("eltwise", "input2", "depth_to_space", eltwise_mode::sum),
+        reorder("out", "eltwise", format::image_2d_rgba, data_types::u8));
+
+    build_options opt_ref;
+    opt_ref.set_option(build_option::optimize_data(false));
+    network network_ref(engine, topology_ref, opt_ref);
+    network_ref.set_input_data("input", input);
+    network_ref.set_input_data("input2", input2);
+
+    auto outputs_ref = network_ref.execute();
+    EXPECT_EQ(outputs_ref.size(), size_t(1));
+    EXPECT_EQ(outputs_ref.begin()->first, "out");
+
+    auto output_ref = outputs_ref.begin()->second.get_memory();
+    auto&& out_ref_layout = output_ref.get_layout();
+    auto out_ref_ptr = output_ref.pointer<uint8_t>();
+
+    for (int i = 0;i < 3 * 256 * 4;i++) {
+        EXPECT_EQ(out_act_ptr[i], out_ref_ptr[i]);
+    }
+}
+
 TEST(fused_conv_eltwise, dont_fuse_if_conv_elt_are_outputs)
 {
     const auto& engine = get_test_engine();
index e5cd5db..3ee79aa 100644 (file)
@@ -87,7 +87,8 @@ template<typename T>
 class BaseFusingTest : public ::testing::TestWithParam<T> {
 public:
     cldnn::engine engine;
-    cldnn::topology topology;
+    cldnn::topology topology_fused;
+    cldnn::topology topology_non_fused;
     cldnn::build_options bo_fused;
     cldnn::build_options bo_not_fused;
 
@@ -110,7 +111,13 @@ public:
             size_t count = 0;
             for (auto& pi : net.get_primitives_info()) {
                 if (pi.type_id == "reorder") {
-                    count++;
+                    auto exec_prims = net.get_executed_primitives();
+                    auto it = std::find_if(exec_prims.begin(), exec_prims.end(), [&](const std::pair<primitive_id, event>& e) -> bool {
+                        return e.first == pi.original_id;
+                    });
+                    // We count executed reorders only
+                    if (it != exec_prims.end())
+                        count++;
                 }
             }
             return count;
@@ -238,6 +245,12 @@ public:
     layout get_single_element_layout(T& p) {
         return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} };
     }
+
+    template <class... Args>
+    void create_topologies(Args const&... args) {
+        topology_fused.add(args...);
+        topology_non_fused.add(args...);
+    }
 };
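create_topologies() above fans the same primitive list out to both members, so the fused and non-fused networks are always built from identical graphs and differ only in their build options. A minimal stand-alone analogue of that pattern (hypothetical stub types, C++17 fold expression, illustrative only):

#include <string>
#include <vector>

struct topology_stub {
    std::vector<std::string> prims;
    template <class... Args>
    void add(Args const&... args) { (prims.push_back(args), ...); }  // append each argument in order
};

struct fusing_fixture_stub {
    topology_stub topology_fused, topology_non_fused;
    template <class... Args>
    void create_topologies(Args const&... args) {
        topology_fused.add(args...);      // same primitives for the fused build
        topology_non_fused.add(args...);  // ... and for the reference build
    }
};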
 
 class WeightsPrimitiveFusingTest : public ::BaseFusingTest<bc_test_params> {
@@ -245,8 +258,8 @@ public:
 
     void execute(bc_test_params& p) {
         auto input_prim = get_mem(get_input_layout(p));
-        network network_not_fused(this->engine, this->topology, bo_not_fused);
-        network network_fused(this->engine, this->topology, bo_fused);
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
 
@@ -269,8 +282,8 @@ public:
 
     void execute(resample_test_params& p) {
         auto input_prim = get_mem(get_input_layout(p));
-        network network_not_fused(this->engine, this->topology, bo_not_fused);
-        network network_fused(this->engine, this->topology, bo_fused);
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
 
@@ -293,8 +306,8 @@ public:
         auto input0_prim = get_mem(get_input_layout(p, 0));
         auto input1_prim = get_mem(get_input_layout(p, 1));
 
-        network network_not_fused(this->engine, this->topology, bo_not_fused);
-        network network_fused(this->engine, this->topology, bo_fused);
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
         network_fused.set_input_data("input0", input0_prim);
         network_not_fused.set_input_data("input0", input0_prim);
         network_fused.set_input_data("input1", input1_prim);
@@ -357,6 +370,7 @@ public:
 #define CASE_CONV_U8S8_4 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
 #define CASE_CONV_U8S8_5 {1, 16, 5, 5}, {1, 32, 5, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_CONV_U8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
+#define CASE_CONV_U8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 
 #define CASE_CONV_S8S8_1 {1, 15, 4, 5}, {1, 30, 2, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_CONV_S8S8_2 {1, 15, 5, 5}, {1, 30, 3, 3}, {1, 1, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
@@ -364,6 +378,7 @@ public:
 #define CASE_CONV_S8S8_4 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
 #define CASE_CONV_S8S8_5 {1, 16, 5, 5}, {1, 32, 5, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 #define CASE_CONV_S8S8_6 {1, 17, 4, 5}, {1, 17, 4, 5}, {1, 1, 1, 1}, tensor{1}, tensor{0}, tensor{1}, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx
+#define CASE_CONV_S8S8_7 {1, 64, 7, 7}, {1, 32, 7, 7}, {1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, tensor{1}, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx
 
 #define CASE_CONV3D_U8S8_1 {1, 15, 5, 4, 5}, {1, 30, 3, 2, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
 #define CASE_CONV3D_U8S8_2 {1, 15, 5, 5, 5}, {1, 30, 3, 3, 3}, {1, 1, 3, 3, 3}, tensor{1}, tensor{0}, tensor{1}, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx
@@ -406,7 +421,7 @@ public:
 class conv_fp32_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
@@ -433,7 +448,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_activation, ::testing::ValuesIn(s
 class conv_fp32_scale : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
@@ -464,7 +479,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale,
 class conv_fp32_prelu_eltwise : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_prelu_eltwise, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("slope_data", get_mem(get_per_channel_layout(p))),
@@ -481,7 +496,7 @@ TEST_P(conv_fp32_prelu_eltwise, basic) {
 
 TEST_P(conv_fp32_prelu_eltwise, vector_ops) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("slope_data", get_mem(get_per_channel_layout(p))),
@@ -502,7 +517,7 @@ TEST_P(conv_fp32_prelu_eltwise, vector_ops) {
 TEST_P(conv_fp32_prelu_eltwise, vector_ops_mixed_types) {
     auto p = GetParam();
     auto slope_type = p.default_type == data_types::f32 ? data_types::f16 : data_types::f32;
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("slope_data", get_mem(layout{ slope_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} })),
@@ -537,7 +552,7 @@ class conv_fp32_eltwise_b_fs_zyx_fsv16 : public WeightsPrimitiveFusingTest {};
 
 TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("eltwise_data", get_mem(get_output_layout(p))),
@@ -556,7 +571,7 @@ TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) {
 class conv_fp32_swish : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_swish, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  convolution("conv_prim", "input", {"weights"}, {"bias"}, p.groups, p.stride, p.pad, p.dilation),
@@ -587,11 +602,11 @@ TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, splitted_vector_ops) {
 
     std::vector<std::string> weights_idx;
     for (size_t w = 0; w < p.groups; w++) {
-        topology.add(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups))));
+        create_topologies(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups))));
         weights_idx.push_back(("weights" + std::to_string(w)));
     }
 
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("eltwise_data", get_mem(get_output_layout(p))),
                  convolution("conv_prim", "input", weights_idx, {}, 1, p.stride, p.pad, p.dilation),
                  eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum),
@@ -626,7 +641,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_eltwise_b_fs_zyx_fsv16,
 class conv_fp32_quantize_u8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_quantize_u8, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -650,7 +665,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_quantize_u8,
 class conv_fp32_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale_quantize_i8, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -678,7 +693,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale_quantize_i8,
 class conv_fp32_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale_activation_quantize_i8, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -705,7 +720,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8,
 class conv_fp32_scale_activation_quantize_i8_eltwise_fp32 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -733,7 +748,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_eltw
 class conv_fp32_scale_activation_quantize_i8_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale_activation_quantize_i8_activation, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -762,7 +777,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_acti
 class conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, DISABLED_basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -800,7 +815,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_eltw
 class conv_bin_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_bin_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  binary_convolution("bin_conv_prim", "input", {"weights"}, p.stride, p.pad, p.dilation, p.out_shape, p.groups),
                  activation("activation", "bin_conv_prim", activation_func::relu),
@@ -819,7 +834,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_bin_activation,
 class conv_bin_scale_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_bin_scale_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
                  binary_convolution("bin_conv_prim", "input", {"weights"}, p.stride, p.pad, p.dilation, p.out_shape, p.groups),
@@ -841,7 +856,7 @@ class conv_bin_quantize_bin : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_bin_quantize_bin, channel_wise_quantize) {
     auto p = GetParam();
     auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("in_lo", in_thresh),
                  data("in_hi", in_thresh),
@@ -858,7 +873,7 @@ TEST_P(conv_bin_quantize_bin, channel_wise_quantize) {
 TEST_P(conv_bin_quantize_bin, blob_wise_quantize) {
     auto p = GetParam();
     auto in_thresh = get_mem(get_single_element_layout(p), min_random, max_random);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("in_lo", in_thresh),
                  data("in_hi", in_thresh),
@@ -885,7 +900,7 @@ TEST_P(conv_bin_scale_conv_dw, dw_kernel_3x3_stride2) {
     auto dw_weights_layout = layout{p.default_type, format::goiyx, dw_tensor};
 
     auto dw_stride = tensor{1, 1, 2, 2};
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
@@ -904,7 +919,7 @@ TEST_P(conv_bin_scale_conv_dw, dw_kernel_3x3_stride1) {
     auto dw_weights_layout = layout{p.default_type, format::goiyx, dw_tensor};
 
     auto dw_stride = tensor{1, 1, 1, 1};
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
@@ -931,7 +946,7 @@ TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride2) {
 
     auto dw_stride = tensor{1, 1, 2, 2};
     auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
@@ -953,7 +968,7 @@ TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride1) {
 
     auto dw_stride = tensor{1, 1, 1, 1};
     auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p), -127, 127)),
                  data("weights_dw", get_mem(dw_weights_layout, -127, 127)),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)),
@@ -981,7 +996,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_bin_scale_conv_dw_prelu,
 class conv_int8_scale : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
@@ -1018,7 +1033,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale,
 class conv_int8_scale_shift_swish : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_shift_swish, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())),
@@ -1060,7 +1075,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_shift_swish,
 class conv_int8_byxf_af32 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)),
@@ -1078,7 +1093,7 @@ TEST_P(conv_int8_byxf_af32, per_channel_coeffs) {
 
 TEST_P(conv_int8_byxf_af32, per_element_coeffs) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("eltwise_data", get_mem(get_output_layout(p))),
@@ -1111,7 +1126,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_byxf_af32,
 class conv_int8_prelu_eltwise : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_prelu_eltwise, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("slope_data", get_mem(get_per_channel_layout(p))),
@@ -1126,16 +1141,43 @@ TEST_P(conv_int8_prelu_eltwise, basic) {
     execute(p);
 }
 
+TEST_P(conv_int8_prelu_eltwise, fsv16) {
+    auto p = GetParam();
+    create_topologies(input_layout("input", get_input_layout(p)),
+                 data("weights", get_mem(get_weights_layout(p))),
+                 data("bias", get_mem(get_bias_layout(p))),
+                 data("slope_data", get_mem(get_per_channel_layout(p))),
+                 data("eltwise_data", get_mem(get_output_layout(p))),
+                 convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation),
+                 activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope),
+                 eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum),
+                 reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32)
+    );
+
+    if (p.default_format.dimension() == 4) {
+        implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" };
+        bo_fused.set_option(build_option::force_implementations({ {"conv_prim", conv_impl} }));
+    } else {
+        // TODO Add 5D int8 optimized convolution implementations
+        return;
+    }
+
+    tolerance = 1e-5f;
+    execute(p);
+}
+
 INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_prelu_eltwise,
                         ::testing::ValuesIn(std::vector<bc_test_params>{
                                 bc_test_params{CASE_CONV_U8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV_U8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV_U8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV_U8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV_U8S8_7, 2, 4},
                                 bc_test_params{CASE_CONV_S8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV_S8S8_2, 2, 4},
                                 bc_test_params{CASE_CONV_S8S8_3, 2, 4},
                                 bc_test_params{CASE_CONV_S8S8_4, 2, 4},
+                                bc_test_params{CASE_CONV_S8S8_7, 2, 4},
 
                                 bc_test_params{CASE_CONV3D_U8S8_1, 2, 4},
                                 bc_test_params{CASE_CONV3D_U8S8_2, 2, 4},
@@ -1150,7 +1192,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_prelu_eltwise,
 class conv_int8_quantize_u8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_quantize_u8, per_channel) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1168,7 +1210,7 @@ TEST_P(conv_int8_quantize_u8, per_channel) {
 
 TEST_P(conv_int8_quantize_u8, per_tensor) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_single_element_layout(p), -10)),
@@ -1208,7 +1250,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_quantize_u8,
 class conv_int8_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1252,7 +1294,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_quantize_i8,
 class conv_int8_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_activation_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1295,7 +1337,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8,
 class conv_int8_scale_activation_quantize_i8_eltwise_fp32 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1339,7 +1381,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltw
 class conv_int8_scale_activation_quantize_i8_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_activation_quantize_i8_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1384,7 +1426,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_acti
 class conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1433,7 +1475,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltw
 class conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec : public WeightsPrimitiveFusingTest {};
 TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1465,7 +1507,7 @@ TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_op
 
 TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops_mixed_types) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(get_weights_layout(p))),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1509,7 +1551,7 @@ TEST_P(conv_int8_asymmetric_weights, basic) {
     auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
     auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
                                            get_weights_layout(p);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(weights_layout)),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)),
@@ -1520,16 +1562,17 @@ TEST_P(conv_int8_asymmetric_weights, basic) {
     tolerance = 1.f;
 
     auto input_prim = get_mem(get_input_layout(p));
-    network network_not_fused(this->engine, this->topology, bo_not_fused);
-    network network_fused(this->engine, this->topology, bo_fused);
+    network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+    network network_fused(this->engine, this->topology_fused, bo_fused);
     network_fused.set_input_data("input", input_prim);
     network_not_fused.set_input_data("input", input_prim);
 
     ASSERT_FALSE(network_fused.get_primitives_info().empty());
     ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
 
+    // Search for both conv_prim and reorder_bfyx, because in the fused topology the convolution is merged with the last reorder
     auto find_conv = [](primitive_info& p) -> bool {
-        if (p.original_id == "conv_prim")
+        if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
             return true;
         return false;
     };
@@ -1575,7 +1618,7 @@ TEST_P(conv_int8_asymmetric_data, basic) {
     auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
     auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
                           get_weights_layout(p);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(weights_layout)),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)),
@@ -1586,16 +1629,17 @@ TEST_P(conv_int8_asymmetric_data, basic) {
     tolerance = 1.f;
 
     auto input_prim = get_mem(get_input_layout(p));
-    network network_not_fused(this->engine, this->topology, bo_not_fused);
-    network network_fused(this->engine, this->topology, bo_fused);
+    network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+    network network_fused(this->engine, this->topology_fused, bo_fused);
     network_fused.set_input_data("input", input_prim);
     network_not_fused.set_input_data("input", input_prim);
 
     ASSERT_FALSE(network_fused.get_primitives_info().empty());
     ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
 
+    // Search for both conv_prim and reorder_bfyx, because in the fused topology the convolution is merged with the last reorder
     auto find_conv = [](primitive_info& p) -> bool {
-        if (p.original_id == "conv_prim")
+        if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
             return true;
         return false;
     };
@@ -1641,7 +1685,7 @@ TEST_P(conv_int8_asymmetric_data_and_weights, basic) {
     auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx;
     auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) :
                           get_weights_layout(p);
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                  data("weights", get_mem(weights_layout)),
                  data("bias", get_mem(get_bias_layout(p))),
                  data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)),
@@ -1654,16 +1698,17 @@ TEST_P(conv_int8_asymmetric_data_and_weights, basic) {
     tolerance = 1.f;
 
     auto input_prim = get_mem(get_input_layout(p));
-    network network_not_fused(this->engine, this->topology, bo_not_fused);
-    network network_fused(this->engine, this->topology, bo_fused);
+    network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+    network network_fused(this->engine, this->topology_fused, bo_fused);
     network_fused.set_input_data("input", input_prim);
     network_not_fused.set_input_data("input", input_prim);
 
     ASSERT_FALSE(network_fused.get_primitives_info().empty());
     ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
 
+    // Search for both conv_prim and reorder_bfyx, because in the fused topology the convolution is merged with the last reorder
     auto find_conv = [](primitive_info& p) -> bool {
-        if (p.original_id == "conv_prim")
+        if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx")
             return true;
         return false;
     };
@@ -1709,7 +1754,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, conv_int8_asymmetric_data_and_weights,
 class fc_fp32_activation : public WeightsPrimitiveFusingTest {};
 TEST_P(fc_fp32_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
                 data("weights", get_mem(get_weights_layout(p))),
                 data("bias", get_mem(get_bias_layout(p))),
                 fully_connected("fc_prim", "input", "weights", "bias"),
@@ -1730,7 +1775,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_fp32_activation, ::testing::ValuesIn(std
 class fc_int8_scale : public WeightsPrimitiveFusingTest {};
 TEST_P(fc_int8_scale, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("weights", get_mem(get_weights_layout(p))),
         data("bias", get_mem(get_bias_layout(p))),
         data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
@@ -1753,7 +1798,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale,
 class fc_int8_quantize_u8 : public WeightsPrimitiveFusingTest {};
 TEST_P(fc_int8_quantize_u8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("weights", get_mem(get_weights_layout(p))),
         data("bias", get_mem(get_bias_layout(p))),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1779,7 +1824,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu_fc, fc_int8_quantize_u8,
 class fc_int8_scale_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(fc_int8_scale_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("weights", get_mem(get_weights_layout(p))),
         data("bias", get_mem(get_bias_layout(p))),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1808,7 +1853,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_quantize_i8,
 class fc_int8_scale_activation_quantize_i8 : public WeightsPrimitiveFusingTest {};
 TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("weights", get_mem(get_weights_layout(p))),
         data("bias", get_mem(get_bias_layout(p))),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1837,7 +1882,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8,
 class gemm_int8_3in_quantize_i8 : public GemmFusingTest {};
 TEST_P(gemm_int8_3in_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input0", get_input_layout(p, 0)),
+    create_topologies(input_layout("input0", get_input_layout(p, 0)),
         input_layout("input1", get_input_layout(p, 1)),
         input_layout("input2", get_input_layout(p, 2)),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
@@ -1863,7 +1908,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_int8_3in_quantize_i8,
 class gemm_int8_2in_quantize_u8 : public GemmFusingTest {};
 TEST_P(gemm_int8_2in_quantize_u8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input0", get_input_layout(p, 0)),
+    create_topologies(input_layout("input0", get_input_layout(p, 0)),
         input_layout("input1", get_input_layout(p, 1)),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
@@ -1888,7 +1933,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_int8_2in_quantize_u8,
 class gemm_int8_2in_act_scale_quantize_i8 : public GemmFusingTest {};
 TEST_P(gemm_int8_2in_act_scale_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input0", get_input_layout(p, 0)),
+    create_topologies(input_layout("input0", get_input_layout(p, 0)),
         input_layout("input1", get_input_layout(p, 1)),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
@@ -1939,7 +1984,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, gemm_int8_2in_act_scale_quantize_i8,
 class resample_quantize : public ResamplePrimitiveFusingTest {};
 TEST_P(resample_quantize, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)),
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
         data("out_lo", get_mem(get_single_element_layout(p), -127)),
@@ -1980,7 +2025,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, resample_quantize,
 class resample_scale_activation : public ResamplePrimitiveFusingTest {};
 TEST_P(resample_scale_activation, basic) {
     auto p = GetParam();
-    topology.add(input_layout("input", get_input_layout(p)),
+    create_topologies(input_layout("input", get_input_layout(p)),
         data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)),
         resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type),
         scale("scale", "resample_prim", "scale_data"),
@@ -2041,16 +2086,24 @@ struct mvn_test_params {
 #define CASE_MVN_3D_F16_2   {2, 16, 8, 8, 8}, data_types::f16, format::bfzyx, true, true, data_types::f16, format::bfzyx
 #define CASE_MVN_I8_1       {1, 16, 8, 8},    data_types::i8, format::bfyx, false, true, data_types::f32, format::bfyx
 #define CASE_MVN_I8_2       {2, 16, 8, 8},    data_types::i8, format::bfyx, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_I8_3       {1, 16, 8, 8},    data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_I8_4       {2, 16, 8, 8},    data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx
 #define CASE_MVN_3D_I8_1    {1, 16, 8, 8, 8}, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx
 #define CASE_MVN_3D_I8_2    {2, 16, 8, 8, 8}, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx
+#define CASE_MVN_U8_1       {1, 16, 8, 8},    data_types::u8, format::bfyx, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_2       {2, 16, 8, 8},    data_types::u8, format::bfyx, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_3       {1, 16, 8, 8},    data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx
+#define CASE_MVN_U8_4       {2, 16, 8, 8},    data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx
+#define CASE_MVN_3D_U8_1    {1, 16, 8, 8, 8}, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx
+#define CASE_MVN_3D_U8_2    {2, 16, 8, 8, 8}, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx
 
 class MVNFusingTest : public ::BaseFusingTest<mvn_test_params> {
 public:
     void execute(mvn_test_params& p) {
         auto input_prim = get_mem(get_input_layout(p));
 
-        network network_not_fused(this->engine, this->topology, bo_not_fused);
-        network network_fused(this->engine, this->topology, bo_fused);
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
 
         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
@@ -2070,7 +2123,7 @@ public:
 class mvn_activation : public MVNFusingTest {};
 TEST_P(mvn_activation, basic) {
     auto p = GetParam();
-    topology.add(
+    create_topologies(
         input_layout("input", get_input_layout(p)),
         mvn("mvn", "input", false, p.normalize_variance),
         activation("act", "mvn", activation_func::hyperbolic_tan),
@@ -2093,14 +2146,22 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_activation,
                         mvn_test_params{ CASE_MVN_3D_F16_2, 2, 3 },
                         mvn_test_params{ CASE_MVN_I8_1, 2, 3 },
                         mvn_test_params{ CASE_MVN_I8_2, 2, 3 },
+                        mvn_test_params{ CASE_MVN_I8_3, 2, 3 },
+                        mvn_test_params{ CASE_MVN_I8_4, 2, 3 },
                         mvn_test_params{ CASE_MVN_3D_I8_1, 2, 3 },
                         mvn_test_params{ CASE_MVN_3D_I8_2, 2, 3 },
+                        mvn_test_params{ CASE_MVN_U8_1, 2, 3 },
+                        mvn_test_params{ CASE_MVN_U8_2, 2, 3 },
+                        mvn_test_params{ CASE_MVN_U8_3, 2, 3 },
+                        mvn_test_params{ CASE_MVN_U8_4, 2, 3 },
+                        mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 },
+                        mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 },
 }), );
 
 class mvn_scale_quantize_i8 : public MVNFusingTest {};
 TEST_P(mvn_scale_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(
+    create_topologies(
         input_layout("input", get_input_layout(p)),
         mvn("mvn", "input", false, p.normalize_variance),
         data("scale_data", get_mem(get_per_channel_layout(p))),
@@ -2130,14 +2191,22 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_scale_quantize_i8,
         // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 4 },
         mvn_test_params{ CASE_MVN_I8_1, 2, 4 },
         mvn_test_params{ CASE_MVN_I8_2, 2, 4 },
+        mvn_test_params{ CASE_MVN_I8_3, 2, 4 },
+        mvn_test_params{ CASE_MVN_I8_4, 2, 4 },
         mvn_test_params{ CASE_MVN_3D_I8_1, 2, 4 },
         mvn_test_params{ CASE_MVN_3D_I8_2, 2, 4 },
+        mvn_test_params{ CASE_MVN_U8_1, 2, 4 },
+        mvn_test_params{ CASE_MVN_U8_2, 2, 4 },
+        mvn_test_params{ CASE_MVN_U8_3, 2, 4 },
+        mvn_test_params{ CASE_MVN_U8_4, 2, 4 },
+        mvn_test_params{ CASE_MVN_3D_U8_1, 2, 4 },
+        mvn_test_params{ CASE_MVN_3D_U8_2, 2, 4 },
 }), );
 
 class mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public MVNFusingTest {};
 TEST_P(mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) {
     auto p = GetParam();
-    topology.add(
+    create_topologies(
         input_layout("input", get_input_layout(p)),
         mvn("mvn", "input", false, p.normalize_variance),
         data("scale_data", get_mem(get_per_channel_layout(p))),
@@ -2175,8 +2244,16 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_scale_activation_quantize_i8_eltwise_fp
         // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 7 },
         mvn_test_params{ CASE_MVN_I8_1, 2, 7 },
         mvn_test_params{ CASE_MVN_I8_2, 2, 7 },
+        mvn_test_params{ CASE_MVN_I8_3, 2, 7 },
+        mvn_test_params{ CASE_MVN_I8_4, 2, 7 },
         mvn_test_params{ CASE_MVN_3D_I8_1, 2, 7 },
         mvn_test_params{ CASE_MVN_3D_I8_2, 2, 7 },
+        mvn_test_params{ CASE_MVN_U8_1, 2, 7 },
+        mvn_test_params{ CASE_MVN_U8_2, 2, 7 },
+        mvn_test_params{ CASE_MVN_U8_3, 2, 7 },
+        mvn_test_params{ CASE_MVN_U8_4, 2, 7 },
+        mvn_test_params{ CASE_MVN_3D_U8_1, 2, 7 },
+        mvn_test_params{ CASE_MVN_3D_U8_2, 2, 7 },
 }), );
 
 
@@ -2216,8 +2293,8 @@ public:
     void execute(pooling_test_params& p) {
         auto input_prim = get_mem(get_input_layout(p));
 
-        network network_not_fused(this->engine, this->topology, bo_not_fused);
-        network network_fused(this->engine, this->topology, bo_fused);
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, bo_fused);
 
         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
@@ -2237,7 +2314,7 @@ public:
 class pooling_activation : public PoolingFusingTest {};
 TEST_P(pooling_activation, basic) {
     auto p = GetParam();
-    topology.add(
+    create_topologies(
         input_layout("input", get_input_layout(p)),
         pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
         activation("act", "pooling", activation_func::relu),
@@ -2265,7 +2342,7 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_activation,
 class pooling_scale : public PoolingFusingTest {};
 TEST_P(pooling_scale, basic) {
     auto p = GetParam();
-    topology.add(
+    create_topologies(
         input_layout("input", get_input_layout(p)),
         data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel_size.count())),
         pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
index d5f73b9..35c9ba8 100644 (file)
@@ -36,18 +36,13 @@ class mvn_gpu_test : public ::testing::TestWithParam<cldnn::format>
 };
 
 template <typename T>
-void mvn_compute_mean_accross_channels_bfyx(cldnn::memory &output, bool normalize_variance)
-{
-    using namespace tests;
-
-    const auto output_desc = generic_test::get_linear_memory_desc(output.get_layout());
+void mvn_compute_mean_accross_channels(cldnn::memory &output, bool normalize_variance) {
+    auto output_size = output.get_layout().size;
 
-    auto output_sizes = output.get_layout().size.sizes();
-
-    uint32_t batch_size = output_sizes[0];
-    uint32_t feature_size = output_sizes[1];
-    uint32_t y_size = output_sizes[3];
-    uint32_t x_size = output_sizes[2];
+    uint32_t batch_size = output_size.batch[0];
+    uint32_t feature_size = output_size.feature[0];
+    uint32_t y_size = output_size.spatial[1];
+    uint32_t x_size = output_size.spatial[0];
 
     auto buff = output.pointer<T>();
 
@@ -63,7 +58,8 @@ void mvn_compute_mean_accross_channels_bfyx(cldnn::memory &output, bool normaliz
             {
                 for (uint32_t x = 0; x < x_size; ++x)
                 {
-                    size_t data_index = generic_test::get_linear_index(output.get_layout(), b, f, y, x, output_desc);
+                    auto index_tensor = tensor(batch(b), feature(f), spatial(x, y, 0, 0));
+                    size_t data_index = output.get_layout().get_linear_offset(index_tensor);
                     float data = static_cast<float>(buff[data_index]);
                     sum += data;
                     if (normalize_variance)
@@ -73,30 +69,25 @@ void mvn_compute_mean_accross_channels_bfyx(cldnn::memory &output, bool normaliz
         }
         sum /= feature_size * y_size * x_size;
         T result_sum = static_cast<T>(sum);
-        EXPECT_NEAR(result_sum, 0.f, err_margin);
+        EXPECT_NEAR(result_sum, 0.f, err_margin) << " at b=" << b;
 
         if (normalize_variance)
         {
             variance /= feature_size * y_size * x_size;
             T result_variance = static_cast<T>(variance);
-            EXPECT_NEAR(result_variance, 1.f, err_margin);
+            EXPECT_NEAR(result_variance, 1.f, err_margin) << " at b=" << b;
         }
     }
 }
 
 template <typename T>
-void mvn_compute_mean_within_channels_bfyx(cldnn::memory &output, bool normalize_variance)
-{
-    using namespace tests;
+void mvn_compute_mean_within_channels(cldnn::memory &output, bool normalize_variance) {
+    auto output_size = output.get_layout().size;
 
-    const auto output_desc = generic_test::get_linear_memory_desc(output.get_layout());
-
-    auto output_sizes = output.get_layout().size.sizes();
-
-    uint32_t batch_size = output_sizes[0];
-    uint32_t feature_size = output_sizes[1];
-    uint32_t y_size = output_sizes[3];
-    uint32_t x_size = output_sizes[2];
+    uint32_t batch_size = output_size.batch[0];
+    uint32_t feature_size = output_size.feature[0];
+    uint32_t y_size = output_size.spatial[1];
+    uint32_t x_size = output_size.spatial[0];
 
     auto buff = output.pointer<T>();
 
@@ -112,7 +103,8 @@ void mvn_compute_mean_within_channels_bfyx(cldnn::memory &output, bool normalize
             {
                 for (uint32_t x = 0; x < x_size; ++x)
                 {
-                    size_t data_index = generic_test::get_linear_index(output.get_layout(), b, f, y, x, output_desc);
+                    auto index_tensor = tensor(batch(b), feature(f), spatial(x, y, 0, 0));
+                    size_t data_index = output.get_layout().get_linear_offset(index_tensor);
                     float data = static_cast<float>(buff[data_index]);
                     sum += data;
                     if (normalize_variance)
@@ -121,13 +113,13 @@ void mvn_compute_mean_within_channels_bfyx(cldnn::memory &output, bool normalize
             }
             sum /= y_size * x_size;
             T result_sum = static_cast<T>(sum);
-            EXPECT_NEAR(result_sum, 0.f, err_margin);
+            EXPECT_NEAR(result_sum, 0.f, err_margin) << " at b=" << b << ", f=" << f;
 
             if (normalize_variance)
             {
                 variance /= y_size * x_size;
                 T result_variance = static_cast<T>(variance);
-                EXPECT_NEAR(result_variance, 1.f, err_margin);
+                EXPECT_NEAR(result_variance, 1.f, err_margin) << " at b=" << b << ", f=" << f;
             }
         }
     }
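For context, the property these helpers check is the usual mean-variance normalization: over each normalization slice (a whole batch item in the across-channels variant, a single (b, f) plane in the within-channels variant) the output should satisfy, up to an implementation-defined epsilon,

    y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}}, \qquad \mu = \frac{1}{N}\sum_i x_i, \qquad \sigma^2 = \frac{1}{N}\sum_i (x_i - \mu)^2,

so recomputing the mean over the output gives ≈ 0 and, when normalize_variance is set, recomputing the variance gives ≈ 1, both within err_margin.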
@@ -158,7 +150,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_accross_channels_bfyx<float>(output, false);
+    mvn_compute_mean_accross_channels<float>(output, false);
 }
 
 TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_fp16)
@@ -186,7 +178,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_fp16)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_accross_channels_bfyx<FLOAT16>(output, false);
+    mvn_compute_mean_accross_channels<FLOAT16>(output, false);
 }
 
 TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance)
@@ -214,7 +206,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_accross_channels_bfyx<float>(output, true);
+    mvn_compute_mean_accross_channels<float>(output, true);
 }
 
 TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance_fp16)
@@ -242,7 +234,7 @@ TEST(mvn_gpu_test, mvn_test_across_channels_bfyx_normalize_variance_fp16)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_accross_channels_bfyx<FLOAT16>(output, true);
+    mvn_compute_mean_accross_channels<FLOAT16>(output, true);
 }
 
 TEST(mvn_gpu_test, mvn_test_within_channels_bfyx)
@@ -270,7 +262,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_within_channels_bfyx<float>(output, false);
+    mvn_compute_mean_within_channels<float>(output, false);
 }
 
 TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_fp16)
@@ -298,7 +290,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_fp16)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_within_channels_bfyx<FLOAT16>(output, false);
+    mvn_compute_mean_within_channels<FLOAT16>(output, false);
 }
 
 TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance)
@@ -326,7 +318,7 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_within_channels_bfyx<float>(output, true);
+    mvn_compute_mean_within_channels<float>(output, true);
 }
 
 TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance_fp16)
@@ -354,5 +346,147 @@ TEST(mvn_gpu_test, mvn_test_within_channels_bfyx_normalize_variance_fp16)
     EXPECT_EQ(outputs.begin()->first, "mvn");
 
     auto output = outputs.begin()->second.get_memory();
-    mvn_compute_mean_within_channels_bfyx<FLOAT16>(output, true);
+    mvn_compute_mean_within_channels<FLOAT16>(output, true);
 }
+
+struct mvn_basic_test_params {
+    format::type input_format;
+    data_types input_type;
+    tensor input_size;
+    bool across_channels;
+    bool normalize_variance;
+    padding output_pad;
+};
+
+struct mvn_random_test : ::testing::TestWithParam<mvn_basic_test_params> {
+    template <typename T>
+    void fill_data(memory& mem, const tests::VVVVF<T>& data) {
+        auto size = mem.get_layout().size;
+        auto ptr = mem.pointer<T>();
+        for (size_t bi = 0; bi < static_cast<size_t>(size.batch[0]); ++bi) {
+            for (size_t fi = 0; fi < static_cast<size_t>(size.feature[0]); ++fi) {
+                for (size_t yi = 0; yi < static_cast<size_t>(size.spatial[1]); ++yi) {
+                    for (size_t xi = 0; xi < static_cast<size_t>(size.spatial[0]); ++xi) {
+                        auto tensor_addr = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        auto offset = mem.get_layout().get_linear_offset(tensor_addr);
+                        ptr[offset] = data[bi][fi][xi][yi];
+                    }
+                }
+            }
+        }
+    }
+
+    template <typename T>
+    void fill_random_data(memory& mem, int min, int max, int k = 8) {
+        auto size = mem.get_layout().size;
+        auto input_data = tests::generate_random_4d<T>(size.batch[0], size.feature[0], size.spatial[0], size.spatial[1], min, max, k);
+        fill_data(mem, input_data);
+    }
+
+    void check_result(memory& output, bool across_channels, bool normalize_variance) {
+        if (output.get_layout().data_type == data_types::f32) {
+            if (across_channels) {
+                mvn_compute_mean_accross_channels<float>(output, normalize_variance);
+            } else {
+                mvn_compute_mean_within_channels<float>(output, normalize_variance);
+            }
+        } else if (output.get_layout().data_type == data_types::f16) {
+            if (across_channels) {
+                mvn_compute_mean_accross_channels<FLOAT16>(output, normalize_variance);
+            } else {
+                mvn_compute_mean_within_channels<FLOAT16>(output, normalize_variance);
+            }
+        }
+    }
+
+    void execute(const mvn_basic_test_params& params, const engine& eng) {
+        auto& size = params.input_size;
+        auto& output_pad = params.output_pad;
+
+        auto input = memory::allocate(eng, { params.input_type, params.input_format, size });
+
+        switch (params.input_type) {
+        case data_types::f32:
+            fill_random_data<float>(input, -127, 127);
+            break;
+        case data_types::f16:
+            fill_random_data<FLOAT16>(input, -127, 127);
+            break;
+        case data_types::i8:
+            fill_random_data<int8_t>(input, -127, 127);
+            break;
+        case data_types::u8:
+            fill_random_data<uint8_t>(input, -127, 127);
+            break;
+        default:
+            break;
+        }
+
+        topology topo;
+        topo.add(input_layout("input", input.get_layout()));
+        auto prim = mvn("mvn", "input", params.across_channels, params.normalize_variance);
+        prim.output_padding = output_pad;
+        topo.add(prim);
+
+        network net(eng, topo);
+
+        net.set_input_data("input", input);
+
+        auto outputs = net.execute();
+        EXPECT_EQ(outputs.size(), size_t(1));
+        EXPECT_EQ(outputs.begin()->first, "mvn");
+
+        auto output = outputs.begin()->second.get_memory();
+        check_result(output, params.across_channels, params.normalize_variance);
+    }
+};
+
+TEST_P(mvn_random_test, random) {
+    auto eng = tests::get_test_engine();
+    this->execute(GetParam(), eng);
+}
+
+struct mvn_test_case_generator : std::vector<mvn_basic_test_params> {
+    mvn_test_case_generator& add(mvn_basic_test_params params) {
+        push_back(params);
+        return *this;
+    }
+
+    mvn_test_case_generator& smoke_tests(format::type fmt, data_types in_dt) {
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, false, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, false, true, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, true, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {7, 10, 17, 13}, true, true, padding() });
+        return *this;
+    }
+
+    mvn_test_case_generator& extended_tests(format::type fmt, data_types in_dt) {
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, true, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, false, padding() });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, true, padding() });
+        // output padding
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, false, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, false, true, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, false, padding({0, 0, 1, 1}) });
+        push_back(mvn_basic_test_params{ fmt, in_dt, {2, 17, 67, 71}, true, true, padding({0, 0, 1, 1}) });
+
+        return *this;
+    }
+};
+
+INSTANTIATE_TEST_CASE_P(smoke,
+                        mvn_random_test,
+                        testing::ValuesIn(
+                            mvn_test_case_generator()
+                            .smoke_tests(format::b_fs_yx_fsv16, data_types::i8)
+                            .smoke_tests(format::b_fs_yx_fsv16, data_types::u8)
+                        ), );
+
+INSTANTIATE_TEST_CASE_P(extended,
+                        mvn_random_test,
+                        testing::ValuesIn(
+                            mvn_test_case_generator()
+                            .extended_tests(format::b_fs_yx_fsv16, data_types::i8)
+                            .extended_tests(format::b_fs_yx_fsv16, data_types::u8)
+                        ), );
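The parameterized mvn_random_test added above follows a pattern these cldnn tests reuse in several places: a plain struct of case parameters, a generator type deriving from std::vector with chainable methods that append cases, and an INSTANTIATE_TEST_CASE_P fed through testing::ValuesIn. A minimal self-contained sketch of the same pattern, using hypothetical demo_* names and only GoogleTest (no cldnn dependencies):

    #include <gtest/gtest.h>
    #include <vector>

    struct demo_params { int size; bool normalize; };

    // Generator derives from std::vector so chained calls build the case list inline.
    struct demo_param_generator : std::vector<demo_params> {
        demo_param_generator& smoke(int size) {
            push_back(demo_params{ size, false });
            push_back(demo_params{ size, true });
            return *this;
        }
    };

    struct demo_random_test : ::testing::TestWithParam<demo_params> {};

    TEST_P(demo_random_test, runs) {
        auto p = GetParam();          // one invocation per generated case
        EXPECT_GT(p.size, 0);
    }

    INSTANTIATE_TEST_CASE_P(smoke, demo_random_test,
                            testing::ValuesIn(demo_param_generator().smoke(8).smoke(16)));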
index 5ab274c..6926224 100644 (file)
@@ -1151,35 +1151,6 @@ TEST(reorder_gpu_opt, remove_redundant_activation_fuse)
     EXPECT_FLOAT_EQ(out_ptr[1], -0.02f);
 }
 
-TEST(reorder_gpu_opt, basic_do_not_remove_redundant_due_it_is_output)
-{
-    engine eng;
-
-    memory in = memory::allocate(eng, { data_types::f32, format::yxfb, tensor{ 1, 2, 2, 1 } });
-    memory weights = memory::allocate(eng, { data_types::f32, format::bfyx, tensor{ 1, 2, 2, 1 } });
-    topology tpl{
-        input_layout("in", in.get_layout()),
-        convolution("conv", "in", { "weights" }),
-        data("weights", weights),
-        reorder("r1", "conv", format::bfyx, data_types::f32) //reoder is output - do not optimize
-    };
-
-    build_options opts;
-    opts.set_option(build_option::optimize_data(true));
-
-    network net(eng, tpl, opts);
-    net.set_input_data("in", in);
-    auto outputs = net.execute();
-    auto executed_primitives = net.get_executed_primitives();
-
-    //all pirmitives in this test needs to be executed
-    EXPECT_TRUE(executed_primitives.count("conv") == 1);
-    EXPECT_TRUE(executed_primitives.count("in") == 1);
-    EXPECT_TRUE(executed_primitives.count("r1") == 1);
-    ASSERT_TRUE(outputs.count("r1") == 1);
-    EXPECT_TRUE(outputs.at("r1").get_memory().get_layout().format == format::bfyx);
-}
-
 TEST(reorder_gpu_opt, basic_remove_redundant_output_due_to_implicit_reorders)
 {
     engine eng;
@@ -1915,6 +1886,98 @@ TEST(reorder_gpu, any_format) {
     }
 }
 
+TEST(reorder_image2d_rgba_to_bfyx_gpu, basic)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::u8, format::image_2d_rgba, { 1, 3, 2, 2 } });
+    layout output_layout(data_types::f16, format::bfyx, { 1, 3, 2, 2 });
+
+    set_values<unsigned char>(input, {
+        1, 0, 5, 7,
+        2, 111, 123, 8,
+        124, 125, 50, 9,
+        251, 252, 253, 210
+        });
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    float answers[12] = {
+        1.0f,  2.0f,
+        124.0f,  251.0f,
+
+        0.0f,  111.0f,
+        125.0f,  252.0f,
+
+        5.0f,  123.0f,
+        50.0f, 253.0f,
+    };
+
+    auto output_ptr = output.pointer<FLOAT16>();
+    for (int i = 0; i < 12; i++)
+    {
+        EXPECT_NEAR(FLOAT16(answers[i] / 255.f), output_ptr[i], 1e-3f);
+    }
+
+}
+
+TEST(reorder_bfyx_to_image2d_rgba_gpu, basic)
+{
+    const auto& engine = get_test_engine();
+
+    auto input = memory::allocate(engine, { data_types::f16, format::bfyx, { 1, 3, 2, 2 } });
+    layout output_layout(data_types::u8, format::image_2d_rgba, { 1, 3, 2, 2 });
+
+    set_values<FLOAT16>(input, {
+        FLOAT16(1.0f / 255.f),  FLOAT16(2.0f / 255.f),
+        FLOAT16(124.0f / 255.f),  FLOAT16(251.0f / 255.f),
+
+        FLOAT16(0.0f / 255.f),  FLOAT16(111.0f / 255.f),
+        FLOAT16(125.0f / 255.f),  FLOAT16(252.0f / 255.f),
+
+        FLOAT16(5.0f / 255.f),  FLOAT16(123.0f / 255.f),
+        FLOAT16(50.0f / 255.f), FLOAT16(253.0f / 255.f),
+        });
+
+    topology topology(
+        input_layout("input", input.get_layout()),
+        reorder("reorder", "input", output_layout));
+
+    network network(engine, topology);
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "reorder");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    unsigned char answers[16] = {
+        1, 0, 5, 0,
+        2, 111, 123, 0,
+        124, 125, 50, 0,
+        251, 252, 253, 0
+    };
+
+    auto output_ptr = output.pointer<unsigned char>();
+    for (int i = 0; i < 16; i++)
+    {
+        EXPECT_EQ(answers[i], output_ptr[i]);
+    }
+
+}
+
 using namespace cldnn;
 
 class reorder_test : public tests::generic_test
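The two reorder tests added above exercise the u8 image_2d_rgba <-> fp16 bfyx conversion: the RGBA bytes are deinterleaved into planar channels, the alpha component is dropped, and values are rescaled by 1/255. A small sketch of that deinterleave on a plain row-major RGBA buffer (hypothetical helper, not a cldnn API):

    #include <cstdint>
    #include <vector>

    // Converts an interleaved H x W RGBA byte image into planar CHW floats in [0, 1],
    // discarding the alpha channel, which is how the expected values in the test are laid out.
    std::vector<float> rgba_to_planar_chw(const std::vector<uint8_t>& rgba, int height, int width) {
        std::vector<float> planar(3 * height * width);
        for (int y = 0; y < height; ++y)
            for (int x = 0; x < width; ++x)
                for (int c = 0; c < 3; ++c)  // c == 3 (alpha) is dropped
                    planar[(c * height + y) * width + x] = rgba[(y * width + x) * 4 + c] / 255.f;
        return planar;
    }

For the 2x2 input above, channel 0 of the result is {1, 2, 124, 251} / 255, matching the first four expected FP16 outputs.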
index 4fd9c0a..8e0b382 100644 (file)
@@ -523,3 +523,165 @@ TEST(resample_gpu, bilinear_asymmetric) {
         }
     }
 }
+
+struct resample_random_test_params {
+    data_types input_type;
+    tensor input_size;
+    tensor output_size;
+    uint32_t num_filter;
+    resample_type operation_type;
+    format::type in_format;
+    format::type out_format;
+};
+
+struct resample_random_test : testing::TestWithParam<resample_random_test_params>{
+    template <typename T>
+    void fill_random_typed(memory& mem, int min, int max) {
+        auto size = mem.get_layout().size;
+        size_t b = size.batch[0];
+        size_t f = size.feature[0];
+        size_t x = size.spatial[0];
+        size_t y = size.spatial[1];
+
+        auto data = generate_random_4d<T>(b, f, y, x, min, max);
+        auto ptr = mem.pointer<T>();
+        for (size_t bi = 0; bi < b; ++bi) {
+            for (size_t fi = 0; fi < f; ++fi) {
+                for (size_t yi = 0; yi < y; ++yi) {
+                    for (size_t xi = 0; xi < x; ++xi) {
+                        auto coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        auto offset = mem.get_layout().get_linear_offset(coords);
+                        ptr[offset] = data[bi][fi][yi][xi];
+                    }
+                }
+            }
+        }
+    }
+
+    void fill_random(memory& mem) {
+        auto dt = mem.get_layout().data_type;
+        switch (dt) {
+        case data_types::f32:
+            fill_random_typed<float>(mem, -127, 127);
+            break;
+        case data_types::f16:
+            fill_random_typed<FLOAT16>(mem, -127, 127);
+            break;
+        case data_types::i8:
+            fill_random_typed<int8_t>(mem, -127, 127);
+            break;
+        case data_types::u8:
+            fill_random_typed<uint8_t>(mem, 0, 255);
+            break;
+        default:
+            break;
+        }
+    }
+
+    template <typename T>
+    void compare_nearest_typed(const memory& input, const memory& output) {
+        auto output_lay = output.get_layout();
+        size_t b = output_lay.size.batch[0];
+        size_t f = output_lay.size.feature[0];
+        size_t x = output_lay.size.spatial[0];
+        size_t y = output_lay.size.spatial[1];
+        float x_ratio = static_cast<float>(input.get_layout().size.spatial[0]) / static_cast<float>(x);
+        float y_ratio = static_cast<float>(input.get_layout().size.spatial[1]) / static_cast<float>(y);
+
+        auto in_ptr = input.pointer<T>();
+        auto out_ptr = output.pointer<T>();
+        for (size_t bi = 0; bi < b; ++bi) {
+            for (size_t fi = 0; fi < f; ++fi) {
+                for (size_t yi = 0; yi < y; ++yi) {
+                    for (size_t xi = 0; xi < x; ++xi) {
+                        auto in_xi = static_cast<size_t>(floor(x_ratio * xi));
+                        auto in_yi = static_cast<size_t>(floor(y_ratio * yi));
+                        auto in_coords = tensor(batch(bi), feature(fi), spatial(in_xi, in_yi, 0, 0));
+                        auto in_offset = input.get_layout().get_linear_offset(in_coords);
+                        auto in_val = in_ptr[in_offset];
+                        auto out_coords = tensor(batch(bi), feature(fi), spatial(xi, yi, 0, 0));
+                        auto out_offset = output.get_layout().get_linear_offset(out_coords);
+                        auto out_val = out_ptr[out_offset];
+                        EXPECT_EQ(in_val, out_val) << " at bi=" << bi << ", fi=" << fi << ", xi=" << xi << ", yi=" << yi;
+                    }
+                }
+            }
+        }
+    }
+
+    void compare(const memory& input, const memory& output, resample_type operation) {
+        auto dt = output.get_layout().data_type;
+        if (operation == resample_type::nearest) {
+            if (dt == data_types::f32) {
+                compare_nearest_typed<float>(input, output);
+            } else if (dt == data_types::f16) {
+                compare_nearest_typed<FLOAT16>(input, output);
+            } else if (dt == data_types::i8) {
+                compare_nearest_typed<int8_t>(input, output);
+            } else if (dt == data_types::u8) {
+                compare_nearest_typed<uint8_t>(input, output);
+            } else {
+                FAIL() << "Not supported data type: " << static_cast<size_t>(dt);
+            }
+        } else {
+            FAIL() << "Not supported resample_type: " << static_cast<int32_t>(operation);
+        }
+    }
+
+    void execute(const resample_random_test_params& params) {
+        auto eng = get_test_engine();
+
+        auto in_layout = layout(params.input_type, params.in_format, params.input_size);
+
+        auto topo = topology(
+            input_layout("in", in_layout),
+            resample("resample", "in", params.output_size, params.num_filter, params.operation_type)
+        );
+
+        auto build_opts = build_options(
+            build_option::force_implementations({ {"resample", {params.out_format, ""}} })
+        );
+        auto net = network(eng, topo, build_opts);
+
+        auto in_mem = memory::allocate(eng, in_layout);
+        fill_random(in_mem);
+        net.set_input_data("in", in_mem);
+
+        auto result = net.execute();
+        auto output = result.at("resample").get_memory();
+
+        compare(in_mem, output, params.operation_type);
+    }
+};
+
+TEST_P(resample_random_test, random) {
+    execute(GetParam());
+}
+
+struct resample_random_test_param_generator : std::vector<resample_random_test_params> {
+    resample_random_test_param_generator& add(resample_random_test_params params) {
+        push_back(params);
+        return *this;
+    }
+
+    resample_random_test_param_generator& smoke_params(data_types type, format::type input_format, format::type output_format) {
+        push_back(resample_random_test_params{ type, {1, 17, 5, 9}, {1, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
+        push_back(resample_random_test_params{ type, {2, 17, 5, 9}, {2, 17, 15, 18}, 1, resample_type::nearest, input_format, output_format });
+        push_back(resample_random_test_params{ type, {1, 7, 10, 17}, {1, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
+        push_back(resample_random_test_params{ type, {2, 7, 10, 17}, {2, 7, 21, 35}, 1, resample_type::nearest, input_format, output_format });
+        return *this;
+    }
+
+};
+
+INSTANTIATE_TEST_CASE_P(smoke,
+                        resample_random_test,
+                        testing::ValuesIn(
+                            resample_random_test_param_generator()
+                            .smoke_params(data_types::i8, format::byxf_af32, format::byxf_af32)
+                            .smoke_params(data_types::u8, format::byxf_af32, format::byxf_af32)
+                            .smoke_params(data_types::i8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
+                            .smoke_params(data_types::u8, format::b_fs_yx_fsv4, format::b_fs_yx_fsv4)
+                            .smoke_params(data_types::i8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
+                            .smoke_params(data_types::u8, format::b_fs_yx_fsv16, format::b_fs_yx_fsv16)
+                        ), );
index d199df4..985162c 100644 (file)
@@ -614,3 +614,113 @@ TEST(strided_slice_gpu_f32, test_2x2x2x1x1_2) {
         EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
     }
 }
+
+TEST(strided_slice_gpu_f32, test_2x2x2x2_full_negative_stride) {
+    // Input (BFYX): 2x2x2x2
+    // Begin (BFYX): 0x0x0x0
+    // End (BFYX): 2x2x2x2
+    // Stride (BFYX): -1x1x1x1
+    // Output (BFYX): 2x2x2x2
+
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, { data_types::f32, format::bfyx, { 2, 2, 2, 2 } });
+    auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+    auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+    auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 4, 1, 1, 1 } });
+
+    set_values(input, {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+            9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f
+    });
+    set_values(begin, {
+            0, 0, 0, 0
+    });
+    set_values(end, {
+            2, 2, 2, 2
+    });
+    set_values(strides, {
+            -1, -1, 1, 1
+    });
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(data("input2", begin));
+    topology.add(data("input3", end));
+    topology.add(data("input4", strides));
+    topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}, {}));
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "strided_slice");
+
+    auto output = outputs.at("strided_slice").get_memory();
+
+    std::vector<float> answers = {
+            12.f, 13.f, 14.f, 15.f, 8.f, 9.f, 10.f, 11.f, 4.f, 5.f, 6.f, 7.f, 0.f, 1.f, 2.f, 3.f };
+
+    auto output_ptr = output.pointer<float>();
+
+    ASSERT_EQ(output_ptr.size(), answers.size());
+    for (size_t i = 0; i < answers.size(); ++i)
+    {
+        EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+    }
+}
+
+TEST(strided_slice_gpu_f32, test_2x2x2x1x1_2_negative_all) {
+    // Input (BFZYX): 2x2x2x1x1
+    // Output (BFZYX): 2x1x1x1x1
+
+    const auto& engine = get_test_engine();
+    auto input = memory::allocate(engine, { data_types::f32, format::bfzyx, { 2, 2, 1, 1, 2 } });
+    auto begin = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+    auto end = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+    auto strides = memory::allocate(engine, { data_types::i32, format::bfyx, { 3, 1, 1, 1 } });
+
+    set_values(input, {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f
+    });
+    set_values(begin, {
+            0, 0, 0
+    });
+    set_values(end, {
+            2, 2, 2
+    });
+    set_values(strides, {
+            1, 2, 2
+    });
+
+    topology topology;
+    topology.add(input_layout("input", input.get_layout()));
+    topology.add(data("input2", begin));
+    topology.add(data("input3", end));
+    topology.add(data("input4", strides));
+    topology.add(strided_slice("strided_slice", "input", "input2", "input3", "input4", {}, {}, {}, {}));
+
+    network network(engine, topology);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    EXPECT_EQ(outputs.size(), size_t(1));
+    EXPECT_EQ(outputs.begin()->first, "strided_slice");
+
+    auto output = outputs.at("strided_slice").get_memory();
+
+    std::vector<float> answers = {
+            0.0f, 4.0f
+    };
+
+    auto output_ptr = output.pointer<float>();
+
+    for (size_t i = 0; i < answers.size(); ++i)
+    {
+        EXPECT_TRUE(are_equal(answers[i], output_ptr[i]));
+    }
+}
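The full-range negative-stride case above expects the batch and feature axes to be walked from their last index down to zero while y and x keep their forward order, which is exactly how the answers vector (12..15, 8..11, 4..7, 0..3) is produced. A hedged reference sketch on a dense bfyx buffer (hypothetical helper, independent of the plugin):

    #include <cstddef>
    #include <vector>

    // Applies stride -1 over b and f and stride +1 over y and x across the full ranges.
    std::vector<float> slice_bf_reversed(const std::vector<float>& in,
                                         size_t B, size_t F, size_t Y, size_t X) {
        std::vector<float> out;
        out.reserve(in.size());
        for (size_t b = B; b-- > 0;)
            for (size_t f = F; f-- > 0;)
                for (size_t y = 0; y < Y; ++y)
                    for (size_t x = 0; x < X; ++x)
                        out.push_back(in[((b * F + f) * Y + y) * X + x]);
        return out;
    }

With the 0..15 input and B = F = Y = X = 2 this reproduces the expected output above.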
index fd5214c..d7be15a 100644 (file)
@@ -185,27 +185,35 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(ref_convolution_bwd_data_t<f32, f32, f32, f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_convolution_bwd_weights_t<f32, f32, f32, f32>),
+#endif
     /* conv (bfloat16) */
     INSTANCE(_jit_uni_dw_convolution_fwd_t<avx512_core, bf16, bf16>),
     INSTANCE(_jit_uni_dw_convolution_fwd_t<avx512_core, bf16, f32>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(_jit_uni_dw_convolution_bwd_data_t<avx512_core, bf16, bf16>),
     INSTANCE(_jit_uni_dw_convolution_bwd_data_t<avx512_core, bf16, f32>),
     INSTANCE(_jit_uni_dw_convolution_bwd_weights_t<avx512_core, bf16, bf16>),
     INSTANCE(_jit_uni_dw_convolution_bwd_weights_t<avx512_core, bf16, f32>),
+#endif
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_fwd_t<f32>),
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_data_t<f32>),
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_data_t<bf16>),
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t<f32>),
     INSTANCE(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t<bf16>),
+#endif
     INSTANCE(jit_avx512_core_bf16_convolution_fwd_t<f32>),
     INSTANCE(jit_avx512_core_bf16_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_avx512_core_bf16_convolution_bwd_data_t<f32>),
     INSTANCE(jit_avx512_core_bf16_convolution_bwd_data_t<bf16>),
     INSTANCE(jit_avx512_core_bf16_convolution_bwd_weights_t<bf16>),
     INSTANCE(jit_avx512_core_bf16_convolution_bwd_weights_t<f32>),
+#endif
     INSTANCE(gemm_bf16_convolution_fwd_t<f32>),
     INSTANCE(gemm_bf16_convolution_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(gemm_bf16_convolution_bwd_data_t<f32>),
     INSTANCE(gemm_bf16_convolution_bwd_data_t<bf16>),
     INSTANCE(gemm_bf16_convolution_bwd_weights_t<f32>),
@@ -314,8 +322,8 @@ static const pd_create_f cpu_impl_list[] = {
 #endif
     /* eltwise */
     INSTANCE(jit_uni_eltwise_fwd_t<avx512_common, f32>),
-#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_uni_eltwise_fwd_t<avx512_common, bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_uni_eltwise_bwd_t<avx512_common, f32>),
     INSTANCE(jit_uni_eltwise_bwd_t<avx512_common, bf16>),
 #endif
@@ -329,8 +337,8 @@ static const pd_create_f cpu_impl_list[] = {
 #endif
 
     INSTANCE(ref_eltwise_fwd_t<f32>),
-#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_eltwise_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_eltwise_bwd_t<f32>),
     INSTANCE(ref_eltwise_bwd_t<bf16>),
 #endif
@@ -358,8 +366,10 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(ref_softmax_fwd_t<f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_softmax_bwd_t<f32>),
+#endif
     /* pool */
     INSTANCE(jit_uni_pooling_fwd_t<avx512_common, bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_uni_pooling_bwd_t<avx512_common, bf16>),
 #endif
     INSTANCE(jit_uni_pooling_fwd_t<avx512_common, f32>),
@@ -373,14 +383,17 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_uni_pooling_fwd_t<sse42, f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_uni_pooling_bwd_t<sse42, f32>),
+#endif
     INSTANCE(nchw_pooling_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(nchw_pooling_bwd_t<bf16>),
 #endif
     INSTANCE(nchw_pooling_fwd_t<f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(nchw_pooling_bwd_t<f32>),
-
+#endif
     INSTANCE(nhwc_pooling_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(nhwc_pooling_bwd_t<bf16>),
 #endif
     INSTANCE(nhwc_pooling_fwd_t<f32>),
@@ -389,8 +402,8 @@ static const pd_create_f cpu_impl_list[] = {
 #endif
 
     INSTANCE(ref_pooling_fwd_t<f32, f32>),
-#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_pooling_fwd_t<bf16, bf16, f32>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_pooling_bwd_t<f32, f32>),
     INSTANCE(ref_pooling_bwd_t<bf16, bf16>),
 #endif
@@ -414,7 +427,9 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx512_common_lrn_fwd_t<f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_avx512_common_lrn_bwd_t<f32>),
+#endif
     INSTANCE(jit_avx512_common_lrn_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(jit_avx512_common_lrn_bwd_t<bf16>),
 #endif
     INSTANCE(jit_uni_lrn_fwd_t<avx2>),
@@ -425,7 +440,9 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(ref_lrn_fwd_t<f32>),
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_lrn_bwd_t<f32>),
+#endif
     INSTANCE(ref_lrn_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_lrn_bwd_t<bf16>),
 #endif
     /* batch normalization */
@@ -475,9 +492,11 @@ static const pd_create_f cpu_impl_list[] = {
 #ifdef ENABLE_UNUSED_PRIM
     INSTANCE(ref_inner_product_bwd_data_t<f32, f32, f32, f32>),
     INSTANCE(ref_inner_product_bwd_weights_t<f32>),
+#endif
     /* inner product (bfloat16) */
     INSTANCE(gemm_bf16_inner_product_fwd_t<f32>),
     INSTANCE(gemm_bf16_inner_product_fwd_t<bf16>),
+#ifdef ENABLE_UNUSED_PRIM
     INSTANCE(gemm_bf16_inner_product_bwd_data_t<f32>),
     INSTANCE(gemm_bf16_inner_product_bwd_data_t<bf16>),
     INSTANCE(gemm_bf16_inner_product_bwd_weights_t<f32>),
index 4652f0c..6ab2965 100644 (file)
@@ -173,8 +173,8 @@ void _jit_avx512_core_bf16_convolution_fwd_t<dst_type>::execute_forward_2d()
         balance211(work_amount, nthr, ithr, start, end);
         auto par_conv = jit_conv_call_s();
 
-        size_t src_h_stride = src_d.blk_off(0, 0, 1);
-        size_t dst_h_stride = dst_d.blk_off(0, 0, 1);
+        size_t src_h_stride = src_d.blk_off(0, 0, 1) - src_d.off_l(0);
+        size_t dst_h_stride = dst_d.blk_off(0, 0, 1) - dst_d.off_l(0);
         size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
 
         int n{0}, g{0}, occ{0}, oh_s{0}, owb{0};
@@ -279,9 +279,9 @@ void _jit_avx512_core_bf16_convolution_fwd_t<dst_type>::execute_forward_3d()
         balance211(work_amount, nthr, ithr, start, end);
         auto par_conv = jit_conv_call_s();
 
-        size_t src_d_stride = src_d.blk_off(0, 0, 1);
-        size_t src_h_stride = src_d.blk_off(0, 0, 0, 1);
-        size_t dst_h_stride = dst_d.blk_off(0, 0, 0, 1);
+        size_t src_d_stride = src_d.blk_off(0, 0, 1) - src_d.off_l(0);
+        size_t src_h_stride = src_d.blk_off(0, 0, 0, 1) - src_d.off_l(0);
+        size_t dst_h_stride = dst_d.blk_off(0, 0, 0, 1) - dst_d.off_l(0);
         size_t wht_d_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
         size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 0, 1);
 
@@ -447,7 +447,7 @@ void _jit_avx512_core_bf16_convolution_bwd_data_t<diff_src_type>
             }
             assert(kd_len >= 0);
 
-            auto diff_src_w = diff_src + 
+            auto diff_src_w = diff_src +
                 diff_src_d.blk_off(n, g_icb, id_s);
             auto diff_dst_w = diff_dst + diff_dst_d.blk_off(n, g_ocb, od_s);
             auto wht_w = weights + wht_blk_off(weights_d, g, 0, icb, kd_lo);
@@ -538,8 +538,8 @@ void _jit_avx512_core_bf16_convolution_bwd_data_t<diff_src_type>
         balance211(work_amount, nthr, ithr, start, end);
 
         auto par_conv = jit_conv_call_s();
-        size_t diff_src_h_stride = diff_src_d.blk_off(0, 0, 1);
-        size_t diff_dst_h_stride = diff_dst_d.blk_off(0, 0, 1);
+        size_t diff_src_h_stride = diff_src_d.blk_off(0, 0, 1) - diff_src_d.off_l(0);
+        size_t diff_dst_h_stride = diff_dst_d.blk_off(0, 0, 1) - diff_dst_d.off_l(0);
         size_t wht_h_stride = wht_blk_off(weights_d, 0, 0, 0, 1);
 
         bool is_fast_path = jcp.dilate_h == 0 && jcp.stride_h == 1;
@@ -562,9 +562,9 @@ void _jit_avx512_core_bf16_convolution_bwd_data_t<diff_src_type>
             int work_rem = end - start;
             int ih_e = ih_s + work_rem > jcp.ih ? jcp.ih : ih_s + work_rem;
 
-            auto diff_src_w = diff_src + 
+            auto diff_src_w = diff_src +
                 diff_src_d.blk_off(n, g_icb);
-            auto diff_dst_w = diff_dst + 
+            auto diff_dst_w = diff_dst +
                 diff_dst_d.blk_off(n, g_ocb);
             auto wht_w = weights + wht_blk_off(weights_d, g, 0, icb);
 
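The stride changes above subtract off_l(0) from blk_off(...), apparently because blk_off returns an absolute element offset that already contains the descriptor's base/padding offset; the difference of two such offsets is then a pure row (or depth) stride. A minimal sketch of that idea on a hypothetical padded 2-D layout (illustration only, not the mkl-dnn memory-descriptor API):

    #include <cstddef>

    struct padded_2d {
        size_t base;       // offset of logical element (0, 0) inside the padded buffer
        size_t row_pitch;  // elements per padded row
        size_t offset(size_t y, size_t x) const { return base + y * row_pitch + x; }
    };

    // Distance between two consecutive rows: the base offset cancels out.
    size_t row_stride(const padded_2d& d) {
        return d.offset(1, 0) - d.offset(0, 0);  // == row_pitch regardless of base
    }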
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/Win/include/win_synchapi.h b/inference-engine/thirdparty/movidius/XLink/pc/Win/include/win_synchapi.h
new file mode 100644 (file)
index 0000000..32584a3
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#ifndef WIN_SYNCHAPI
+#define WIN_SYNCHAPI
+
+#include "win_pthread.h"
+#include "synchapi.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _pthread_condattr_t pthread_condattr_t;
+
+typedef struct
+{
+    CONDITION_VARIABLE _cv;
+}
+pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t* __cond, const pthread_condattr_t* __cond_attr);
+int pthread_cond_destroy(pthread_cond_t* __cond);
+
+int pthread_cond_timedwait(pthread_cond_t* __cond,
+    pthread_mutex_t* __mutex,
+    const struct timespec* __abstime);
+int pthread_cond_broadcast(pthread_cond_t* __cond);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* WIN_SYNCHAPI */
diff --git a/inference-engine/thirdparty/movidius/XLink/pc/Win/src/win_synchapi.c b/inference-engine/thirdparty/movidius/XLink/pc/Win/src/win_synchapi.c
new file mode 100644 (file)
index 0000000..26bd365
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (C) 2018-2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "win_synchapi.h"
+
+int pthread_cond_init(pthread_cond_t* __cond, const pthread_condattr_t* __cond_attr)
+{
+    if (__cond == NULL) {
+        return ERROR_INVALID_HANDLE;
+    }
+
+    (void)__cond_attr;
+    InitializeConditionVariable(&__cond->_cv);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t* __cond)
+{
+    (void)__cond;
+    return 0;
+}
+
+int pthread_cond_timedwait(pthread_cond_t* __cond,
+    pthread_mutex_t* __mutex,
+    const struct timespec* __abstime)
+{
+    if (__cond == NULL) {
+        return ERROR_INVALID_HANDLE;
+    }
+
+    long long msec = INFINITE;
+    if (__abstime != NULL) {
+        msec = __abstime->tv_sec * 1000 + __abstime->tv_nsec / 1000000;
+    }
+
+    return SleepConditionVariableCS(&__cond->_cv, __mutex, (DWORD)msec);
+}
+
+int pthread_cond_broadcast(pthread_cond_t *__cond)
+{
+    if (__cond == NULL) {
+        return ERROR_INVALID_HANDLE;
+    }
+
+    WakeConditionVariable(&__cond->_cv);
+    return 0;
+}
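A hedged usage sketch for this shim (hypothetical wait helper, not part of the commit). Note that this pthread_cond_timedwait passes tv_sec/tv_nsec straight to SleepConditionVariableCS, so the timespec behaves as a relative timeout rather than a POSIX absolute deadline; the watchdog change below builds its Windows wait time the same way. The sketch assumes the mutex and condition variable were initialized at startup with pthread_mutex_init and pthread_cond_init from these XLink wrappers:

    #include "win_pthread.h"
    #include "win_synchapi.h"

    static pthread_mutex_t g_lock;   // assumed initialized at startup
    static pthread_cond_t  g_cond;   // assumed initialized at startup
    static int g_ready = 0;

    // Waits up to `ms` milliseconds for another thread to set g_ready and broadcast g_cond.
    int wait_ready_for_ms(long long ms)
    {
        struct timespec t;
        t.tv_sec  = (long)(ms / 1000);              // relative seconds (see note above)
        t.tv_nsec = (long)((ms % 1000) * 1000000);  // relative nanoseconds
        pthread_mutex_lock(&g_lock);
        if (!g_ready)
            pthread_cond_timedwait(&g_cond, &g_lock, &t);  // returns on broadcast or after ~ms
        int result = g_ready;
        pthread_mutex_unlock(&g_lock);
        return result;
    }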
index 8f439e6..0c5e91b 100644 (file)
@@ -13,7 +13,7 @@ namespace Watchdog {
  */
 class IDevice {
  public:
-    using time_point = std::chrono::high_resolution_clock::time_point;
+    using time_point = std::chrono::steady_clock::time_point;
 
     virtual ~IDevice() = default;
 
index 10faaf8..6e022c6 100644 (file)
@@ -12,6 +12,7 @@
 #include <watchdogPrivate.hpp>
 #include <algorithm>
 #include <memory>
+#include <string>
 #include <ncCommPrivate.h>
 #include <mvnc.h>
 #include <ncPrivateTypes.h>
 #include "XLinkPrivateDefines.h"
 #include "XLinkErrorUtils.h"
 
+#if defined(_WIN32)
+#include "win_synchapi.h"
+#endif // defined(_WIN32)
+
 namespace {
 
 using namespace std;
@@ -34,7 +39,7 @@ using namespace Watchdog;
  */
 class XLinkDevice : public IDevice {
     _devicePrivate_t privateDevice;
-    using time_point = std::chrono::high_resolution_clock::time_point;
+    using time_point = std::chrono::steady_clock::time_point;
     time_point lastPongTime = time_point::min();
     time_point lastPingTime = time_point::min();
     enum : int { deviceHangTimeout = 12000};
@@ -162,13 +167,6 @@ struct wd_context_opaque {
 };
 
 class WatchdogImpl {
-    enum : uint8_t {
-        STATE_IDLE = 0,
-        INITIATE_THREAD_STOP = 1,
-        THREAD_EXITED = 2,
-        WAKE_UP_THREAD = 3,
-    };
-
     using wd_context_as_tuple = std::tuple<std::shared_ptr<IDevice>, bool*, void*>;
 
     using Devices = std::list<wd_context_as_tuple>;
@@ -176,35 +174,82 @@ class WatchdogImpl {
     std::mutex devicesListAcc;
     std::atomic<int> generation = {0};
     std::atomic_bool threadRunning;
-    volatile std::uint8_t notificationReason = STATE_IDLE;
-    std::condition_variable wakeUpPingThread;
 
+    pthread_mutex_t routineLock;
+    pthread_cond_t  wakeUpPingThread;
     std::thread poolThread;
 
-    WatchdogImpl() = default;
     WatchdogImpl(const WatchdogImpl&) = delete;
     WatchdogImpl(WatchdogImpl&&) = delete;
     WatchdogImpl& operator = (const WatchdogImpl&) = delete;
     WatchdogImpl& operator = (WatchdogImpl&&) = delete;
- public:
+
+private:
+
+    WatchdogImpl() {
+        int rc = pthread_mutex_init(&routineLock, NULL);
+        if (rc != 0) {
+            throw std::runtime_error("failed to initialize \"routineLock\" mutex. rc: " + std::to_string(rc));
+        }
+
+#if !(defined(__APPLE__) || defined(_WIN32))
+        pthread_condattr_t attr;
+        rc = pthread_condattr_init(&attr);
+        if (rc != 0) {
+            throw std::runtime_error("failed to initialize condition variable attribute. rc: " + std::to_string(rc));
+        }
+
+        rc = pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
+        if (rc != 0) {
+            throw std::runtime_error("failed to set condition variable clock. rc: " + std::to_string(rc));
+        }
+#endif // !(defined(__APPLE__) || defined(_WIN32))
+
+        rc = pthread_cond_init(&wakeUpPingThread, NULL);
+        if (rc != 0) {
+            throw std::runtime_error("failed to initialize \"wakeUpPingThread\" condition variable. rc: " + std::to_string(rc));
+        }
+    }
+
+public:
 
     static WatchdogImpl &instance() {
         static WatchdogImpl watchdog;
         return watchdog;
     }
 
+
     ~WatchdogImpl() {
         mvLog(MVLOG_INFO, "watchdog terminated\n");
+        try
         {
-            auto __lock = lock();
+            lockRoutineMutex();
             for (auto &item : watchedDevices) {
                 *std::get<1>(item) = true;
                 mvLog(MVLOG_WARN, "[%p] device, stop watching due to watchdog termination\n", std::get<2>(item));
             }
-            notificationReason = THREAD_EXITED;
+            unlockRoutineMutex();
+        } catch (const std::exception & ex) {
+            mvLog(MVLOG_ERROR, "error %s", ex.what());
+        } catch (...) {
+            mvLog(MVLOG_ERROR, "unknown error");
         }
 
-        wakeUpPingThread.notify_one();
+        threadRunning = false;
+        int rc = pthread_cond_broadcast(&wakeUpPingThread);
+        if (rc != 0) {
+            mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+        }
+
+        rc = pthread_mutex_destroy(&routineLock);
+        if (rc != 0) {
+            mvLog(MVLOG_WARN, "failed to destroy the \"routineLock\". rc=%d", rc);
+        }
+
+        rc = pthread_cond_destroy(&wakeUpPingThread);
+        if (rc != 0) {
+            mvLog(MVLOG_WARN, "failed to destroy the \"wakeUpPingThread\". rc=%d", rc);
+        }
 
         if (poolThread.joinable()) {
             poolThread.join();
@@ -213,7 +258,7 @@ class WatchdogImpl {
 
 public:
     void *register_device(std::shared_ptr<IDevice> device) {
-        auto __locker = lock();
+        lockRoutineMutex();
         std::unique_ptr<wd_context_opaque> ctx (new wd_context_opaque);
 
         // rare case of exact pointer address collision
@@ -240,8 +285,10 @@ public:
             });
         } else {
             // wake up thread
-            notificationReason = WAKE_UP_THREAD;
-            wakeUpPingThread.notify_one();
+            int rc = pthread_cond_broadcast(&wakeUpPingThread);
+            if (rc != 0) {
+                mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+            }
         }
 
         ctx->handleCached = device->getHandle();
@@ -249,6 +296,7 @@ public:
 
         ctx->actual = std::get<0>(watchedDevices.back()).get();
 
+        unlockRoutineMutex();
         return ctx.release();
     }
 
@@ -262,11 +310,12 @@ public:
         if (ptr == nullptr) {
             return false;
         }
-        auto __locker = lock();
+        lockRoutineMutex();
 
         // thread already removed
         if (ptr->destroyed) {
             delete ptr;
+            unlockRoutineMutex();
             return true;
         }
 
@@ -282,16 +331,28 @@ public:
         }
 
         // wake up thread since we might select removed device as nex to be ping, and there is no more devices available
-        notificationReason = WAKE_UP_THREAD;
-        __locker.unlock();
-        wakeUpPingThread.notify_one();
+        unlockRoutineMutex();
+        int rc = pthread_cond_broadcast(&wakeUpPingThread);
+        if (rc != 0) {
+            mvLog(MVLOG_WARN, "failed to unblock threads blocked on the \"wakeUpPingThread\". rc=%d", rc);
+        }
 
         return bFound;
     }
 
  private:
-    std::unique_lock<std::mutex> lock() {
-        return std::unique_lock<std::mutex>(devicesListAcc);
+    void lockRoutineMutex() {
+        int rc = pthread_mutex_lock(&routineLock);
+        if (rc != 0) {
+            throw std::runtime_error("failed to lock \"routineLock\" mutex. rc: " + std::to_string(rc));
+        }
+    }
+
+    void unlockRoutineMutex() {
+        int rc = pthread_mutex_unlock(&routineLock);
+        if (rc != 0) {
+            throw std::runtime_error("failed to unlock \"routineLock\" mutex. rc: " + std::to_string(rc));
+        }
     }
 
     void watchdog_routine() noexcept {
@@ -299,14 +360,16 @@ public:
             mvLog(MVLOG_INFO, "thread started\n");
 
             milliseconds sleepInterval;
-            auto __locker = lock();
+            struct timespec timeToWait = {0, 0};
+            lockRoutineMutex();
+
             do {
                 for (auto deviceIt = watchedDevices.begin(); deviceIt != watchedDevices.end(); ) {
                     auto &device = std::get<0>(*deviceIt);
-                    auto isReady = device->dueIn(high_resolution_clock::now()).count() == 0;
+                    auto isReady = device->dueIn(steady_clock::now()).count() == 0;
                     if (isReady) {
                         auto now = high_resolution_clock::now();
-                        device->keepAlive(high_resolution_clock::now());
+                        device->keepAlive(steady_clock::now());
                         mvLog(MVLOG_DEBUG, "ping completed in %ld ms\n", duration_cast<std::chrono::milliseconds>(high_resolution_clock ::now()-now).count());
                     }
                     if (device->isTimeout()) {
@@ -319,7 +382,7 @@ public:
                         ++deviceIt;
                     }
                 }
-                auto currentTime = high_resolution_clock::now();
+                auto currentTime = steady_clock::now();
                 auto minInterval = std::min_element(watchedDevices.begin(),
                                                     watchedDevices.end(),
                                                     [&currentTime] (const Devices::value_type & device1, const Devices::value_type & device2) {
@@ -336,26 +399,39 @@ public:
                 sleepInterval = std::get<0>(*minInterval)->dueIn(currentTime);
                 mvLog(MVLOG_DEBUG, "sleep interval = %ld ms\n", sleepInterval.count());
 
-                notificationReason = STATE_IDLE;
+                auto sec = std::chrono::duration_cast<std::chrono::seconds>(sleepInterval);
 
-                wakeUpPingThread.wait_until(__locker, currentTime + sleepInterval, [this, currentTime]() {
-                    mvLog(MVLOG_DEBUG,
-                          "waiting for %ld ms\n",
-                          duration_cast<std::chrono::milliseconds>(high_resolution_clock::now() - currentTime).count());
-                    return notificationReason != STATE_IDLE;
-                });
+#if (defined(__APPLE__) || defined(_WIN32))
+                timeToWait.tv_sec = sec.count();
+                timeToWait.tv_nsec =
+                    std::chrono::duration_cast<std::chrono::nanoseconds>(sleepInterval).count() -
+                    std::chrono::nanoseconds(sec).count();
+#else
+                clock_gettime(CLOCK_MONOTONIC, &timeToWait);
+                timeToWait.tv_sec += sec.count();
+                timeToWait.tv_nsec +=
+                    std::chrono::duration_cast<std::chrono::nanoseconds>(sleepInterval).count() -
+                    std::chrono::nanoseconds(sec).count();
+#endif // (defined(__APPLE__) || defined(_WIN32))
+
+#if defined(__APPLE__)
+                pthread_cond_timedwait_relative_np(&wakeUpPingThread, &routineLock, &timeToWait);
+#else
+                pthread_cond_timedwait(&wakeUpPingThread, &routineLock, &timeToWait);
+#endif // defined(__APPLE__)
 
-                mvLog(MVLOG_DEBUG, "waiting completed in  %ld ms\n",
-                      duration_cast<std::chrono::milliseconds>(high_resolution_clock ::now() - currentTime).count());
-            } while (notificationReason != THREAD_EXITED);
 
+                mvLog(MVLOG_DEBUG, "waiting completed in  %ld ms\n",
+                      duration_cast<std::chrono::milliseconds>(steady_clock::now() - currentTime).count());
+            } while (threadRunning);
         } catch (const std::exception & ex) {
-            mvLog(MVLOG_ERROR, "error %s\n", ex.what());
+            mvLog(MVLOG_ERROR, "error %s", ex.what());
         } catch (...) {
-            mvLog(MVLOG_ERROR, "error\n");
+            mvLog(MVLOG_ERROR, "unknown error");
         }
+
+        unlockRoutineMutex();
         mvLog(MVLOG_INFO, "thread ended\n");
-        threadRunning = false;
     }
 };
 
@@ -440,25 +516,33 @@ WD_API wd_error_t watchdog_register_device(wd_context * ctx, devicePrivate_t *de
 }
 
 WD_API wd_error_t watchdog_unregister_device(wd_context *ctx) {
-    if (ctx == nullptr || ctx->opaque == nullptr) {
-        return WD_NOTINITIALIZED;
-    } else {
-        if (ctx->opaque != WD_OPAQUE_MAGIC) {
-            auto watchee = reinterpret_cast<wd_context_opaque*>(ctx->opaque);
-            // NOTE: magic field used to pass preallocated watchee - since this function only used by plugin, this is not a backdoor
-            if (watchee->magic == WD_OPAQUE_MAGIC) {
-                if (!WatchdogImpl::instance().remove_device(ctx->opaque)) {
-                    mvLog(MVLOG_WARN, "cannot remove device\n");
-                    return WD_FAIL;
+    try {
+        if (ctx == nullptr || ctx->opaque == nullptr) {
+            return WD_NOTINITIALIZED;
+        } else {
+            if (ctx->opaque != WD_OPAQUE_MAGIC) {
+                auto watchee = reinterpret_cast<wd_context_opaque *>(ctx->opaque);
+                // NOTE: magic field used to pass preallocated watchee - since this function only used by plugin, this is not a backdoor
+                if (watchee->magic == WD_OPAQUE_MAGIC) {
+                    if (!WatchdogImpl::instance().remove_device(ctx->opaque)) {
+                        mvLog(MVLOG_WARN, "cannot remove device\n");
+                        return WD_FAIL;
+                    }
                 }
             }
         }
-    }
 
-    if (ctx != nullptr) {
-        // opaque pointer deleted
-        ctx->opaque = nullptr;
+        if (ctx != nullptr) {
+            // opaque pointer deleted
+            ctx->opaque = nullptr;
+        }
+
+        return WD_ERRNO;
+    } catch (const std::exception & ex) {
+        mvLog(MVLOG_ERROR, "error %s", ex.what());
+    } catch (...) {
+        mvLog(MVLOG_ERROR, "unknown error");
     }
 
-    return WD_ERRNO;
+    return WD_FAIL;
 }
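On the non-Windows path the routine above arms pthread_cond_timedwait with an absolute CLOCK_MONOTONIC deadline, matching the clock selected via pthread_condattr_setclock in the constructor. A hedged standalone sketch of that conversion (hypothetical helper); the explicit carry step keeps tv_nsec inside the [0, 1e9) range that pthread_cond_timedwait accepts:

    #include <time.h>
    #include <chrono>

    struct timespec deadline_after(std::chrono::milliseconds interval)
    {
        struct timespec t;
        clock_gettime(CLOCK_MONOTONIC, &t);  // same clock the condition variable was configured with
        auto sec = std::chrono::duration_cast<std::chrono::seconds>(interval);
        t.tv_sec  += sec.count();
        t.tv_nsec += static_cast<long>(
            std::chrono::duration_cast<std::chrono::nanoseconds>(interval - sec).count());
        if (t.tv_nsec >= 1000000000L) {      // normalize the nanosecond carry
            t.tv_sec  += 1;
            t.tv_nsec -= 1000000000L;
        }
        return t;
    }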
index edec297..4bfed66 100644 (file)
@@ -195,4 +195,4 @@ Below are fragments of sample output for CPU and FPGA devices:
 ## See Also
 * [Using Inference Engine Samples](./docs/IE_DG/Samples_Overview.md)
 * [Model Optimizer](./docs/MO_DG/Deep_Learning_Model_Optimizer_DevGuide.md)
-* [Model Downloader](./tools/downloader/README.md)
\ No newline at end of file
+* [Model Downloader](./tools/downloader/README.md)
index 25168ba..55247b6 100644 (file)
@@ -48,5 +48,5 @@ add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
 # install
 
 install(TARGETS compile_tool
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
         COMPONENT core)
index 798b3b9..072db1b 100644 (file)
@@ -49,5 +49,5 @@ add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
 # install
 
 install(TARGETS ${TARGET_NAME}
-        RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+        RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
         COMPONENT myriad)
\ No newline at end of file
index e2dc3b9..bb8ba29 100644 (file)
@@ -54,6 +54,6 @@ if(ENABLE_MYRIAD)
     add_perfcheck_target(myriad_perfcheck myriadPlugin)
 
     install(TARGETS myriad_perfcheck
-            RUNTIME DESTINATION ${IE_CPACK_LIBRARY_PATH}
+            RUNTIME DESTINATION ${IE_CPACK_RUNTIME_PATH}
             COMPONENT myriad)
 endif()
diff --git a/model-optimizer/automation/create_package.py b/model-optimizer/automation/create_package.py
new file mode 100644 (file)
index 0000000..2112e5a
--- /dev/null
@@ -0,0 +1,19 @@
+import argparse
+import os
+from shutil import rmtree
+
+from utils import Automation
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--build_number", type=str, help="Build number to be added to package version", default="0", )
+args = parser.parse_args()
+
+auto = Automation()
+base_dir = os.path.dirname(__file__)
+bom_path = os.path.join(base_dir, "package_BOM.txt")
+bom = auto.parse_bom(bom_path=bom_path)
+dir_to_tar = auto.copy_files_from_bom(root_path=os.path.join(os.path.dirname(__file__), ".."), bom=bom)
+auto.add_version_txt(dst_path=dir_to_tar, build_number=args.build_number)
+
+auto.make_tarfile(out_file_name="mo_for_tf_{0}.tar.gz".format(args.build_number), source_dir=dir_to_tar)
+rmtree(dir_to_tar)
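For example, invoking this script as python model-optimizer/automation/create_package.py --build_number 42 would, by its own logic, copy the files listed in package_BOM.txt into a staging directory, add a version file carrying the build number, pack the result into mo_for_tf_42.tar.gz, and remove the staging directory again (assuming the Automation helpers imported from utils behave as their names suggest).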
diff --git a/model-optimizer/automation/package_BOM.txt b/model-optimizer/automation/package_BOM.txt
new file mode 100644 (file)
index 0000000..7369c75
--- /dev/null
@@ -0,0 +1,963 @@
+extensions/__init__.py
+extensions/analysis/__init__.py
+extensions/analysis/boolean_input.py
+extensions/analysis/inputs.py
+extensions/analysis/json_print.py
+extensions/analysis/nodes.py
+extensions/analysis/tf_od_api.py
+extensions/analysis/tf_retinanet.py
+extensions/analysis/tf_yolo.py
+extensions/back/__init__.py
+extensions/back/ActivationsNormalizer.py
+extensions/back/AvgPool.py
+extensions/back/blob_normalizer.py
+extensions/back/compress_quantized_weights.py
+extensions/back/ConvolutionNormalizer.py
+extensions/back/CropToStridedSlice.py
+extensions/back/CutMemory.py
+extensions/back/disable_unsupported_ND_operations.py
+extensions/back/DumpFakeQuantStat.py
+extensions/back/ElementwiseOpsToEltwiseOps.py
+extensions/back/EnableConstantStridedSlice.py
+extensions/back/ForceStrictPrecision.py
+extensions/back/fuse_sub_div_min.py
+extensions/back/FuseTransposesSequence.py
+extensions/back/GatherNormalizer.py
+extensions/back/GroupedConvWeightsNormalize.py
+extensions/back/I64ToI32.py
+extensions/back/insert_compatibility_l2normalization.py
+extensions/back/InterpolateToInterpOrResample.py
+extensions/back/kaldi_remove_memory_output.py
+extensions/back/LeakyReLUMutation.py
+extensions/back/LeakyReluToReluWithNegativeSlope.py
+extensions/back/LRNToNorm.py
+extensions/back/LSTMCellNormalizer.py
+extensions/back/MatMulNormalizer.py
+extensions/back/MaxPool.py
+extensions/back/NonMaximumSuppressionNormalize.py
+extensions/back/NormalizeToNormalizeL2.py
+extensions/back/OneHotNormalizer.py
+extensions/back/op_versioning.py
+extensions/back/OptimizeTransposeReshapeSequence.py
+extensions/back/PackBinaryWeights.py
+extensions/back/PadToV7.py
+extensions/back/ParameterToPlaceholder.py
+extensions/back/pass_separator.py
+extensions/back/priorbox_mutation.py
+extensions/back/ProposalMutation.py
+extensions/back/ReduceToPooling.py
+extensions/back/ReduceTransposeDimensions.py
+extensions/back/remove_last_softmax_pattern.py
+extensions/back/RemoveUselessConvert.py
+extensions/back/Reshape0DToSqueeze.py
+extensions/back/ReshapeMutation.py
+extensions/back/ResultNormalizer.py
+extensions/back/ReverseInputChannels.py
+extensions/back/RNNSequenceTypeRename.py
+extensions/back/ScalarConstNormalize.py
+extensions/back/SelectBroadcast.py
+extensions/back/ShapeOfToShape.py
+extensions/back/ShuffleChannelPatternOptimization.py
+extensions/back/ShufflenetReLUReorder.py
+extensions/back/SpecialNodesFinalization.py
+extensions/back/split_normalizer.py
+extensions/back/StridedSliceMasksNormalizer.py
+extensions/back/TileNormalizer.py
+extensions/back/TopKNormalizer.py
+extensions/back/TransposeReduceFusing.py
+extensions/back/TransposeToPermute.py
+extensions/back/UselessConcatRemoval.py
+extensions/front/__init__.py
+extensions/front/ArgMaxSqueeze.py
+extensions/front/ATenToEmbeddingBag.py
+extensions/front/AttributedGatherNormalizer.py
+extensions/front/AttributedPadToPad.py
+extensions/front/binary_quantize_normalization.py
+extensions/front/caffe/__init__.py
+extensions/front/caffe/accum_ext.py
+extensions/front/caffe/argmax_ext.py
+extensions/front/caffe/ArgMaxFlatten.py
+extensions/front/caffe/axpy.py
+extensions/front/caffe/binarization.py
+extensions/front/caffe/binary_conv_ext.py
+extensions/front/caffe/bn.py
+extensions/front/caffe/conv_ext.py
+extensions/front/caffe/correlation_ext.py
+extensions/front/caffe/ctcgreedydecoder_ext.py
+extensions/front/caffe/CustomLayersMapping.xml.example
+extensions/front/caffe/data_augmentation_ext.py
+extensions/front/caffe/detection_output.py
+extensions/front/caffe/elementwise_ext.py
+extensions/front/caffe/eltwise_add_normalize.py
+extensions/front/caffe/elu.py
+extensions/front/caffe/flatten_ext.py
+extensions/front/caffe/grn_ext.py
+extensions/front/caffe/inner_product_ext.py
+extensions/front/caffe/input_ext.py
+extensions/front/caffe/interp_ext.py
+extensions/front/caffe/lrn_ext.py
+extensions/front/caffe/mvn_ext.py
+extensions/front/caffe/normalize_ext.py
+extensions/front/caffe/permute_ext.py
+extensions/front/caffe/pooling_ext.py
+extensions/front/caffe/power_file_ext.py
+extensions/front/caffe/prelu_ext.py
+extensions/front/caffe/priorbox_clustered_ext.py
+extensions/front/caffe/priorbox_ext.py
+extensions/front/caffe/proposal_ext.py
+extensions/front/caffe/proposal_python_ext.py
+extensions/front/caffe/psroipooling_ext.py
+extensions/front/caffe/regionyolo_ext.py
+extensions/front/caffe/relu6.py
+extensions/front/caffe/relu_ext.py
+extensions/front/caffe/reorgyolo_ext.py
+extensions/front/caffe/resample_ext.py
+extensions/front/caffe/reshape.py
+extensions/front/caffe/ShuffleChannel.py
+extensions/front/caffe/shufflechannel_ext.py
+extensions/front/caffe/sigmoid.py
+extensions/front/caffe/simplernms_ext.py
+extensions/front/caffe/slice_to_split.py
+extensions/front/caffe/softmax_ext.py
+extensions/front/caffe/spatial_transformer_ext.py
+extensions/front/caffe/split_to_identity.py
+extensions/front/caffe/tanh.py
+extensions/front/ChangeCastOutputType.py
+extensions/front/ChangePlaceholderTypes.py
+extensions/front/create_tensor_nodes.py
+extensions/front/disable_weights_quantize_value_propagation.py
+extensions/front/div.py
+extensions/front/eltwise_n.py
+extensions/front/ExpandDimsToUnsqueeze.py
+extensions/front/FillToBroadcast.py
+extensions/front/flatten_to_reshape.py
+extensions/front/freeze_placeholder_value.py
+extensions/front/GeLUMerger_Erf.py
+extensions/front/global_pooling_to_reduce.py
+extensions/front/image_scaler.py
+extensions/front/input_cut.py
+extensions/front/instance_normalization.py
+extensions/front/InterpolateNormalizer.py
+extensions/front/kaldi/__init__.py
+extensions/front/kaldi/add_permute_after_convolution.py
+extensions/front/kaldi/add_reshape_around_convolution.py
+extensions/front/kaldi/add_reshape_around_pooling.py
+extensions/front/kaldi/apply_counts.py
+extensions/front/kaldi/logsoftmax_component_ext.py
+extensions/front/kaldi/memory_offset_adjustment.py
+extensions/front/kaldi/replace_eltwise_nin1.py
+extensions/front/kaldi/replace_lstm_node_pattern.py
+extensions/front/kaldi/replace_lstm_nonlinearity.py
+extensions/front/kaldi/set_ports.py
+extensions/front/kaldi/sigmoid_ext.py
+extensions/front/kaldi/split_memoryoffsets.py
+extensions/front/kaldi/tanh_component_ext.py
+extensions/front/Log1p.py
+extensions/front/LogSoftmax.py
+extensions/front/LRNReplacer.py
+extensions/front/MatMul_normalizer.py
+extensions/front/MoveEmbeddedInputsToInputs.py
+extensions/front/mxnet/__init__.py
+extensions/front/mxnet/activation.py
+extensions/front/mxnet/adaptive_avg_pooling_ext.py
+extensions/front/mxnet/add_input_data_to_prior_boxes.py
+extensions/front/mxnet/arange_ext.py
+extensions/front/mxnet/arange_replacer.py
+extensions/front/mxnet/block_grad_ext.py
+extensions/front/mxnet/box_nms_ext.py
+extensions/front/mxnet/cast_ext.py
+extensions/front/mxnet/check_softmax_node_inputs.py
+extensions/front/mxnet/clip_ext.py
+extensions/front/mxnet/conv_ext.py
+extensions/front/mxnet/copy_ext.py
+extensions/front/mxnet/crop_ext.py
+extensions/front/mxnet/custom.py
+extensions/front/mxnet/custom_rpn_proposal.py
+extensions/front/mxnet/deformable_conv_ext.py
+extensions/front/mxnet/deformable_psroi_pooling_ext.py
+extensions/front/mxnet/dropout_ext.py
+extensions/front/mxnet/elementwise_ext.py
+extensions/front/mxnet/eltwise_scalar_replacers.py
+extensions/front/mxnet/exp_ext.py
+extensions/front/mxnet/expand_dims_ext.py
+extensions/front/mxnet/flatten_ext.py
+extensions/front/mxnet/fully_connected_ext.py
+extensions/front/mxnet/gather.py
+extensions/front/mxnet/gather_ext.py
+extensions/front/mxnet/instance_norm_ext.py
+extensions/front/mxnet/leaky_relu.py
+extensions/front/mxnet/lrn_ext.py
+extensions/front/mxnet/max_ext.py
+extensions/front/mxnet/multibox_detection_ext.py
+extensions/front/mxnet/mx_reshape_reverse.py
+extensions/front/mxnet/mx_reshape_to_reshape.py
+extensions/front/mxnet/MXRepeatReplacer.py
+extensions/front/mxnet/null_ext.py
+extensions/front/mxnet/pad_ext.py
+extensions/front/mxnet/pooling_ext.py
+extensions/front/mxnet/proposal_ext.py
+extensions/front/mxnet/psroi_pooling_ext.py
+extensions/front/mxnet/repeat_ext.py
+extensions/front/mxnet/reshape_ext.py
+extensions/front/mxnet/RNN_ext.py
+extensions/front/mxnet/rnn_param_concat.py
+extensions/front/mxnet/roi_pooling_ext.py
+extensions/front/mxnet/shape_array_ext.py
+extensions/front/mxnet/sigmoid.py
+extensions/front/mxnet/slice_channel_ext.py
+extensions/front/mxnet/slice_ext.py
+extensions/front/mxnet/slice_like_ext.py
+extensions/front/mxnet/slice_replacers.py
+extensions/front/mxnet/softmax.py
+extensions/front/mxnet/softmax_activation_ext.py
+extensions/front/mxnet/softmax_ext.py
+extensions/front/mxnet/softmax_output_ext.py
+extensions/front/mxnet/squeeze_ext.py
+extensions/front/mxnet/ssd_anchor_reshape.py
+extensions/front/mxnet/ssd_detection_output_replacer.py
+extensions/front/mxnet/ssd_pattern_flatten_softmax_activation.py
+extensions/front/mxnet/ssd_pattern_remove_flatten.py
+extensions/front/mxnet/ssd_pattern_remove_reshape.py
+extensions/front/mxnet/ssd_pattern_remove_transpose.py
+extensions/front/mxnet/ssd_reorder_detection_out_inputs.py
+extensions/front/mxnet/stack_ext.py
+extensions/front/mxnet/swapaxis_ext.py
+extensions/front/mxnet/tile_ext.py
+extensions/front/mxnet/tile_replacer.py
+extensions/front/mxnet/transpose_ext.py
+extensions/front/mxnet/up_sampling_ext.py
+extensions/front/mxnet/where_ext.py
+extensions/front/mxnet/yolo_v3_mobilenet1_voc.json
+extensions/front/mxnet/zeros_ext.py
+extensions/front/no_op_eraser.py
+extensions/front/onnx/__init__.py
+extensions/front/onnx/activation_ext.py
+extensions/front/onnx/affine_ext.py
+extensions/front/onnx/argmax_ext.py
+extensions/front/onnx/aten_ext.py
+extensions/front/onnx/cast_ext.py
+extensions/front/onnx/clip_ext.py
+extensions/front/onnx/const_ext.py
+extensions/front/onnx/constant_fill_ext.py
+extensions/front/onnx/constant_of_shape_ext.py
+extensions/front/onnx/constant_of_shape_to_broadcast.py
+extensions/front/onnx/conv_ext.py
+extensions/front/onnx/crop_ext.py
+extensions/front/onnx/deformable_conv_ext.py
+extensions/front/onnx/detection_output.py
+extensions/front/onnx/detectionoutput_ext.py
+extensions/front/onnx/dropout_ext.py
+extensions/front/onnx/elementwise_ext.py
+extensions/front/onnx/expand_ext.py
+extensions/front/onnx/flatten_ext.py
+extensions/front/onnx/flattenONNX_to_reshape.py
+extensions/front/onnx/gather_ext.py
+extensions/front/onnx/gemm_ext.py
+extensions/front/onnx/group_norm_ext.py
+extensions/front/onnx/gru_ext.py
+extensions/front/onnx/hard_sigmoid_ext.py
+extensions/front/onnx/image_scaler_ext.py
+extensions/front/onnx/instance_normalization_ext.py
+extensions/front/onnx/lp_normalization_ext.py
+extensions/front/onnx/lrn_ext.py
+extensions/front/onnx/lstm_ext.py
+extensions/front/onnx/mask_rcnn.json
+extensions/front/onnx/mask_rcnn_conversion.py
+extensions/front/onnx/matmul_ext.py
+extensions/front/onnx/mean_variance_normalization_ext.py
+extensions/front/onnx/non_max_suppression_ext.py
+extensions/front/onnx/non_max_suppression_normalize.py
+extensions/front/onnx/non_zero_ext.py
+extensions/front/onnx/normalize_ext.py
+extensions/front/onnx/normalize_l2_normalize.py
+extensions/front/onnx/one_hot_ext.py
+extensions/front/onnx/one_hot_normalize.py
+extensions/front/onnx/pad_ext.py
+extensions/front/onnx/parameter_ext.py
+extensions/front/onnx/person_detection_crossroad.json
+extensions/front/onnx/person_detection_crossroad_conversion.py
+extensions/front/onnx/pooling_ext.py
+extensions/front/onnx/priorbox_clustered_ext.py
+extensions/front/onnx/priorbox_ext.py
+extensions/front/onnx/priorgridgenerator_ext.py
+extensions/front/onnx/proposal_ext.py
+extensions/front/onnx/quantize_ext.py
+extensions/front/onnx/reduce_max_ext.py
+extensions/front/onnx/reduce_mean_ext.py
+extensions/front/onnx/reduce_min_ext.py
+extensions/front/onnx/reduce_prod_ext.py
+extensions/front/onnx/reduce_sum_ext.py
+extensions/front/onnx/remove_filtering_boxes_by_size.py
+extensions/front/onnx/resize_ext.py
+extensions/front/onnx/resize_to_interpolate.py
+extensions/front/onnx/reverse_sequence_ext.py
+extensions/front/onnx/rnn_ext.py
+extensions/front/onnx/roialign_ext.py
+extensions/front/onnx/roifeatureextractor_ext.py
+extensions/front/onnx/scatter_ext.py
+extensions/front/onnx/shape_ext.py
+extensions/front/onnx/slice_ext.py
+extensions/front/onnx/softmax_ext.py
+extensions/front/onnx/split_ext.py
+extensions/front/onnx/squeeze_ext.py
+extensions/front/onnx/top_k_ext.py
+extensions/front/onnx/topkrois_ext.py
+extensions/front/onnx/transpose_ext.py
+extensions/front/onnx/unsqueeze_ext.py
+extensions/front/onnx/upsample_ext.py
+extensions/front/output_cut.py
+extensions/front/override_batch.py
+extensions/front/Pack.py
+extensions/front/pass_separator.py
+extensions/front/PowerToEltwises.py
+extensions/front/rank_decomposer.py
+extensions/front/reciprocal.py
+extensions/front/reduce_axis_normalizer.py
+extensions/front/reshape_dim_normalizer.py
+extensions/front/restore_ports.py
+extensions/front/scatter_normalizer.py
+extensions/front/softmax.py
+extensions/front/softsign_replacer.py
+extensions/front/split_normalizer.py
+extensions/front/squared_difference.py
+extensions/front/SqueezeNormalize.py
+extensions/front/standalone_const_eraser.py
+extensions/front/sub.py
+extensions/front/tf/__init__.py
+extensions/front/tf/activation_ext.py
+extensions/front/tf/argmax_ext.py
+extensions/front/tf/assign_elimination.py
+extensions/front/tf/basic_lstm_cell.py
+extensions/front/tf/batch_to_space_ext.py
+extensions/front/tf/BatchMatMul_ext.py
+extensions/front/tf/BatchToSpaceNDToUpsample.py
+extensions/front/tf/BlockLSTM.py
+extensions/front/tf/BlockLSTM_ext.py
+extensions/front/tf/bucketize.py
+extensions/front/tf/bucketize_ext.py
+extensions/front/tf/Cast_ext.py
+extensions/front/tf/concat.py
+extensions/front/tf/concat_ext.py
+extensions/front/tf/const_ext.py
+extensions/front/tf/conv_ext.py
+extensions/front/tf/crop_and_resize_ext.py
+extensions/front/tf/CropAndResizeReplacement.py
+extensions/front/tf/CTCGreedyDecoder.py
+extensions/front/tf/CTCGreedyDecoder_ext.py
+extensions/front/tf/deconv_ext.py
+extensions/front/tf/depth_to_space.py
+extensions/front/tf/elementwise_ext.py
+extensions/front/tf/expand_dims_ext.py
+extensions/front/tf/extract_image_patches_ext.py
+extensions/front/tf/fake_const_ext.py
+extensions/front/tf/FakeQuantWithMinMaxVars.py
+extensions/front/tf/FakeQuantWithMinMaxVars_ext.py
+extensions/front/tf/faster_rcnn_support.json
+extensions/front/tf/faster_rcnn_support_api_v1.10.json
+extensions/front/tf/faster_rcnn_support_api_v1.13.json
+extensions/front/tf/faster_rcnn_support_api_v1.14.json
+extensions/front/tf/faster_rcnn_support_api_v1.15.json
+extensions/front/tf/faster_rcnn_support_api_v1.7.json
+extensions/front/tf/fifo_queue_v2_ext.py
+extensions/front/tf/fifo_replacer.py
+extensions/front/tf/fill_ext.py
+extensions/front/tf/FlattenToReshape.py
+extensions/front/tf/floor_ext.py
+extensions/front/tf/gather_ext.py
+extensions/front/tf/GatherTree_ext.py
+extensions/front/tf/GNMT_DynamicSequenceLengths.py
+extensions/front/tf/identity_ext.py
+extensions/front/tf/InterpolateTransposes.py
+extensions/front/tf/IteratorGetNext_ext.py
+extensions/front/tf/LoopCond_ext.py
+extensions/front/tf/lrn_ext.py
+extensions/front/tf/mask_rcnn_support.json
+extensions/front/tf/mask_rcnn_support_api_v1.11.json
+extensions/front/tf/mask_rcnn_support_api_v1.13.json
+extensions/front/tf/mask_rcnn_support_api_v1.14.json
+extensions/front/tf/mask_rcnn_support_api_v1.15.json
+extensions/front/tf/mask_rcnn_support_api_v1.7.json
+extensions/front/tf/matmul_ext.py
+extensions/front/tf/mvn.py
+extensions/front/tf/mvn_unrolled.py
+extensions/front/tf/nearest_neighbor_upsampling.py
+extensions/front/tf/next_iteration_ext.py
+extensions/front/tf/non_max_suppression_ext.py
+extensions/front/tf/non_max_suppression_normalize.py
+extensions/front/tf/ObjectDetectionAPI.py
+extensions/front/tf/one_hot_ext.py
+extensions/front/tf/pad_ext.py
+extensions/front/tf/pad_tf_to_pad.py
+extensions/front/tf/placeholder_ext.py
+extensions/front/tf/placeholder_with_default_ext.py
+extensions/front/tf/pooling_ext.py
+extensions/front/tf/prelu.py
+extensions/front/tf/reduce_ext.py
+extensions/front/tf/reshape_related_ext.py
+extensions/front/tf/resize_bilinear.py
+extensions/front/tf/resize_nearest_neighbor.py
+extensions/front/tf/retinanet.json
+extensions/front/tf/RetinaNetFilteredDetectionsReplacement.py
+extensions/front/tf/reverse_sequence.py
+extensions/front/tf/reverse_v2.py
+extensions/front/tf/rfcn_support.json
+extensions/front/tf/rfcn_support_api_v1.10.json
+extensions/front/tf/rfcn_support_api_v1.13.json
+extensions/front/tf/rfcn_support_api_v1.14.json
+extensions/front/tf/select_ext.py
+extensions/front/tf/sign_ext.py
+extensions/front/tf/SizeReplacer.py
+extensions/front/tf/slice_ext.py
+extensions/front/tf/softmax_ext.py
+extensions/front/tf/space_to_batch.py
+extensions/front/tf/space_to_batch_ext.py
+extensions/front/tf/space_to_depth_ext.py
+extensions/front/tf/sparse_fill_empty_rows_ext.py
+extensions/front/tf/sparse_segment_mean_ext.py
+extensions/front/tf/sparse_segment_sqrtn_ext.py
+extensions/front/tf/sparse_segment_sum_ext.py
+extensions/front/tf/sparse_to_dense_ext.py
+extensions/front/tf/sparse_weighted_sum.py
+extensions/front/tf/split_ext.py
+extensions/front/tf/SplitConcatPairToInterpolate.py
+extensions/front/tf/ssd_support.json
+extensions/front/tf/ssd_support_api_v1.14.json
+extensions/front/tf/ssd_support_api_v1.15.json
+extensions/front/tf/ssd_toolbox_detection_output.json
+extensions/front/tf/ssd_toolbox_multihead_detection_output.json
+extensions/front/tf/ssd_v2_support.json
+extensions/front/tf/SSDToolboxDetectionOutput.py
+extensions/front/tf/swap_deconv_inputs.py
+extensions/front/tf/swish.py
+extensions/front/tf/SwitchMergeOptimization.py
+extensions/front/tf/TensorArrayExtractors.py
+extensions/front/tf/TensorArrayGatherV3.py
+extensions/front/tf/tensorflow_custom_operations_config_update.py
+extensions/front/tf/tile_ext.py
+extensions/front/tf/topk_ext.py
+extensions/front/tf/transpose_ext.py
+extensions/front/tf/transposed_mvn_unrolled.py
+extensions/front/tf/unique_ext.py
+extensions/front/tf/UnpackPackReverseInputChannels.py
+extensions/front/tf/variable_ext.py
+extensions/front/tf/variables_values_freezing.py
+extensions/front/tf/yolo_v1.json
+extensions/front/tf/yolo_v1_tiny.json
+extensions/front/tf/yolo_v2.json
+extensions/front/tf/yolo_v2_tiny.json
+extensions/front/tf/yolo_v2_tiny_voc.json
+extensions/front/tf/yolo_v2_voc.json
+extensions/front/tf/yolo_v3.json
+extensions/front/tf/yolo_v3_tiny.json
+extensions/front/tf/yolo_v3_voc.json
+extensions/front/TopKNormalize.py
+extensions/front/transformations_config.py
+extensions/front/TransposeOrderNormalizer.py
+extensions/front/user_data_repack.py
+extensions/front/YOLO.py
+extensions/load/__init__.py
+extensions/load/caffe/__init__.py
+extensions/load/caffe/loader.py
+extensions/load/kaldi/__init__.py
+extensions/load/kaldi/loader.py
+extensions/load/loader.py
+extensions/load/mxnet/__init__.py
+extensions/load/mxnet/loader.py
+extensions/load/onnx/__init__.py
+extensions/load/onnx/loader.py
+extensions/load/tf/__init__.py
+extensions/load/tf/loader.py
+extensions/middle/__init__.py
+extensions/middle/AddFakeQuantizeFuse.py
+extensions/middle/AddIsCyclicAttribute.py
+extensions/middle/AddMeanScaleValues.py
+extensions/middle/AnchorToPriorBox.py
+extensions/middle/ApplyNHWCtoNCHWpermutation.py
+extensions/middle/ApplyPermutations.py
+extensions/middle/ArgMaxToTopK.py
+extensions/middle/AttributedTileNormalizer.py
+extensions/middle/BiasAddBroadcasting.py
+extensions/middle/BinarizeWeightsM1P1.py
+extensions/middle/BlockLSTMtoLSTMSequence.py
+extensions/middle/CheckForCycle.py
+extensions/middle/ConcatOptimization.py
+extensions/middle/ConstSwitchResolver.py
+extensions/middle/ConvertGroupedStridedSlice.py
+extensions/middle/ConvertLayoutDependentOperations.py
+extensions/middle/ConvertMultiInputConv.py
+extensions/middle/ConvToBinaryConv.py
+extensions/middle/CustomSubgraphCall.py
+extensions/middle/CutInputHavingZeroDimFromConcat.py
+extensions/middle/DecomposeBias.py
+extensions/middle/DecomposeBidirectionalRNNSequence.py
+extensions/middle/Deconvolution3rdInputNormalization.py
+extensions/middle/DeleteControlFlowEdges.py
+extensions/middle/DeleteNotExecutable.py
+extensions/middle/DepthToSpace.py
+extensions/middle/DilatedConvolution.py
+extensions/middle/EltwiseChecker.py
+extensions/middle/EltwiseInputNormalization.py
+extensions/middle/EltwiseInputReshape.py
+extensions/middle/EmbeddingBagResolver.py
+extensions/middle/FakeSplitOutputs.py
+extensions/middle/FusedBatchNormNonConstant.py
+extensions/middle/FusedBatchNormTraining.py
+extensions/middle/FuseReshapesSequence.py
+extensions/middle/fusings.py
+extensions/middle/GatherNdNormalizer.py
+extensions/middle/GroupNorm.py
+extensions/middle/GRURNNSequenceToTensorIterator.py
+extensions/middle/InputCut.py
+extensions/middle/InsertLayoutPropagationTransposes.py
+extensions/middle/InsertSelect.py
+extensions/middle/InterpolateSequenceToInterpolate.py
+extensions/middle/L2NormToNorm.py
+extensions/middle/LayoutChangeForConstantShapePaths.py
+extensions/middle/LeakyReluPattern.py
+extensions/middle/LSTMRNNSequenceToTensorIterator.py
+extensions/middle/MinimumMiddleReplacer.py
+extensions/middle/MulAddToSS.py
+extensions/middle/MulFakeQuantizeFuse.py
+extensions/middle/MXNetRNNSequenceNormalize.py
+extensions/middle/MXNetSplitMultiLayers.py
+extensions/middle/MXTileReplacer.py
+extensions/middle/NasNet.py
+extensions/middle/ONNXRNNSequenceNormalize.py
+extensions/middle/PartialInfer.py
+extensions/middle/pass_separator.py
+extensions/middle/permute_tensor_iterator.py
+extensions/middle/preprocessing.py
+extensions/middle/quantize_fuses.py
+extensions/middle/ReluQuantizeFuse.py
+extensions/middle/RemoveDuplicationMemory.py
+extensions/middle/RemoveIdentity.py
+extensions/middle/RemoveRedundantReshapeAfterCropAndResize.py
+extensions/middle/RemoveRedundantReshapes.py
+extensions/middle/RemoveUselessConcatSplit.py
+extensions/middle/RemoveUselessCrops.py
+extensions/middle/RemoveUselessPad.py
+extensions/middle/ReplaceMemoryOffsetWithSplice.py
+extensions/middle/ReplacePNorm.py
+extensions/middle/ReplaceSpliceNodePattern.py
+extensions/middle/reverse_tensor_iterator.py
+extensions/middle/ReverseTransposeNormalization.py
+extensions/middle/ReverseV2ToReverseSequence.py
+extensions/middle/RNNSequenceNormalizeToIE.py
+extensions/middle/ScaleInput.py
+extensions/middle/SharedWeightsDuplication.py
+extensions/middle/SliceConverter.py
+extensions/middle/space_to_depth.py
+extensions/middle/sparse_reshape.py
+extensions/middle/ssd_anchors_to_const.py
+extensions/middle/SwapAxesMiddleReplacer.py
+extensions/middle/TensorIterator_utils.py
+extensions/middle/TensorIteratorBackEdge.py
+extensions/middle/TensorIteratorCondition.py
+extensions/middle/TensorIteratorConditionChecker.py
+extensions/middle/TensorIteratorInput.py
+extensions/middle/TensorIteratorLSTMToLSTMSequence.py
+extensions/middle/TensorIteratorMerge.py
+extensions/middle/TensorIteratorOutput.py
+extensions/middle/TF_lstm_cell_to_generic.py
+extensions/middle/UnsqueezeTileReshapeBlockToInterpolate.py
+extensions/middle/UpsampleToResample.py
+extensions/middle/UselessMerge.py
+extensions/middle/UselessSplitEraser.py
+extensions/middle/UselessStridedSlice.py
+extensions/middle/wights_permute_normalizer.py
+extensions/ops/__init__.py
+extensions/ops/accum.py
+extensions/ops/activation_ops.py
+extensions/ops/adaptive_avg_pooling.py
+extensions/ops/argmax.py
+extensions/ops/assert_op.py
+extensions/ops/aten.py
+extensions/ops/axpy.py
+extensions/ops/binarization.py
+extensions/ops/BlockLSTM.py
+extensions/ops/bn.py
+extensions/ops/box_nms.py
+extensions/ops/bucketize.py
+extensions/ops/Cast.py
+extensions/ops/constant_fill.py
+extensions/ops/copyop.py
+extensions/ops/correlation.py
+extensions/ops/ctc_greedy_decoder.py
+extensions/ops/data_augmentation.py
+extensions/ops/depth_to_space.py
+extensions/ops/DetectionOutput.py
+extensions/ops/detectionoutput_onnx.py
+extensions/ops/elementwise.py
+extensions/ops/embedding_bag.py
+extensions/ops/Enter.py
+extensions/ops/Exit.py
+extensions/ops/exp.py
+extensions/ops/fakequantize.py
+extensions/ops/gather.py
+extensions/ops/GatherNd.py
+extensions/ops/GatherTree.py
+extensions/ops/gelu.py
+extensions/ops/grn.py
+extensions/ops/GRU.py
+extensions/ops/GRUCell.py
+extensions/ops/hard_sigmoid.py
+extensions/ops/identity.py
+extensions/ops/instance_normalization.py
+extensions/ops/interp.py
+extensions/ops/interpolate.py
+extensions/ops/Log.py
+extensions/ops/LSTM.py
+extensions/ops/lstm_cell.py
+extensions/ops/lstm_sequence.py
+extensions/ops/MatMul.py
+extensions/ops/merge.py
+extensions/ops/mvn.py
+extensions/ops/mxrepeat.py
+extensions/ops/mxreshape.py
+extensions/ops/mxslice.py
+extensions/ops/NextIteration.py
+extensions/ops/non_max_suppression.py
+extensions/ops/non_zero.py
+extensions/ops/normalize.py
+extensions/ops/normalize_l2.py
+extensions/ops/one_hot.py
+extensions/ops/pack.py
+extensions/ops/parameter.py
+extensions/ops/pnorm.py
+extensions/ops/power_file.py
+extensions/ops/prediction_heatmap.py
+extensions/ops/prelu.py
+extensions/ops/priorbox.py
+extensions/ops/priorbox_clustered.py
+extensions/ops/priorgridgenerator_onnx.py
+extensions/ops/proposal.py
+extensions/ops/proposal_onnx.py
+extensions/ops/proposal_python_example.py
+extensions/ops/psroipooling.py
+extensions/ops/range.py
+extensions/ops/rank.py
+extensions/ops/ReduceOps.py
+extensions/ops/regionyolo.py
+extensions/ops/reorgyolo.py
+extensions/ops/resample.py
+extensions/ops/resize.py
+extensions/ops/resize_factor_utils.py
+extensions/ops/Reverse.py
+extensions/ops/reverse_sequence.py
+extensions/ops/RNN.py
+extensions/ops/RNNCell.py
+extensions/ops/roialign.py
+extensions/ops/roifeatureextractor_onnx.py
+extensions/ops/scatter.py
+extensions/ops/select.py
+extensions/ops/shufflechannel.py
+extensions/ops/simplernms.py
+extensions/ops/size.py
+extensions/ops/space_to_depth.py
+extensions/ops/sparse_fill_empty_rows.py
+extensions/ops/sparse_reshape.py
+extensions/ops/sparse_segment_mean.py
+extensions/ops/sparse_segment_sqrtn.py
+extensions/ops/sparse_segment_sum.py
+extensions/ops/sparse_to_dense.py
+extensions/ops/sparse_weighted_sum.py
+extensions/ops/spatial_transformer.py
+extensions/ops/splice.py
+extensions/ops/split.py
+extensions/ops/stop_gradient.py
+extensions/ops/swapaxis.py
+extensions/ops/switch.py
+extensions/ops/tensor_iterator.py
+extensions/ops/TensorArray.py
+extensions/ops/TensorArrayGather.py
+extensions/ops/TensorArrayRead.py
+extensions/ops/TensorArrayScatter.py
+extensions/ops/TensorArraySize.py
+extensions/ops/TensorArrayWrite.py
+extensions/ops/TensorIterator_ops.py
+extensions/ops/topk.py
+extensions/ops/topkrois_onnx.py
+extensions/ops/transpose.py
+extensions/ops/unique.py
+extensions/ops/upsample.py
+install_prerequisites/install_prerequisites.bat
+install_prerequisites/install_prerequisites.sh
+install_prerequisites/install_prerequisites_caffe.bat
+install_prerequisites/install_prerequisites_caffe.sh
+install_prerequisites/install_prerequisites_kaldi.bat
+install_prerequisites/install_prerequisites_kaldi.sh
+install_prerequisites/install_prerequisites_mxnet.bat
+install_prerequisites/install_prerequisites_mxnet.sh
+install_prerequisites/install_prerequisites_onnx.bat
+install_prerequisites/install_prerequisites_onnx.sh
+install_prerequisites/install_prerequisites_tf.bat
+install_prerequisites/install_prerequisites_tf.sh
+install_prerequisites/protobuf-3.6.1-py3.4-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.5-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.6-win-amd64.egg
+install_prerequisites/protobuf-3.6.1-py3.7-win-amd64.egg
+mo.py
+mo/__init__.py
+mo/back/__init__.py
+mo/back/ie_ir_ver_2/__init__.py
+mo/back/ie_ir_ver_2/emitter.py
+mo/back/replacement.py
+mo/front/__init__.py
+mo/front/caffe/__init__.py
+mo/front/caffe/collect_attributes.py
+mo/front/caffe/custom_layers_mapping.py
+mo/front/caffe/extractor.py
+mo/front/caffe/extractors/__init__.py
+mo/front/caffe/extractors/batchnorm.py
+mo/front/caffe/extractors/concat.py
+mo/front/caffe/extractors/crop.py
+mo/front/caffe/extractors/native_caffe.py
+mo/front/caffe/extractors/roipooling.py
+mo/front/caffe/extractors/scale.py
+mo/front/caffe/extractors/slice.py
+mo/front/caffe/extractors/tile.py
+mo/front/caffe/extractors/utils.py
+mo/front/caffe/loader.py
+mo/front/caffe/proto/__init__.py
+mo/front/caffe/proto/caffe_pb2.py
+mo/front/caffe/proto/generate_caffe_pb2.py
+mo/front/caffe/proto/mo_caffe.proto
+mo/front/caffe/python_layer_extractor.py
+mo/front/caffe/register_custom_ops.py
+mo/front/common/__init__.py
+mo/front/common/custom_replacement_registry.py
+mo/front/common/extractors/utils.py
+mo/front/common/find_unsupported_ops.py
+mo/front/common/layout.py
+mo/front/common/partial_infer/__init__.py
+mo/front/common/partial_infer/batch_norm.py
+mo/front/common/partial_infer/caffe_fallback.py
+mo/front/common/partial_infer/concat.py
+mo/front/common/partial_infer/crop.py
+mo/front/common/partial_infer/elemental.py
+mo/front/common/partial_infer/eltwise.py
+mo/front/common/partial_infer/multi_box_detection.py
+mo/front/common/partial_infer/multi_box_prior.py
+mo/front/common/partial_infer/random_uniform.py
+mo/front/common/partial_infer/reshape.py
+mo/front/common/partial_infer/roipooling.py
+mo/front/common/partial_infer/slice.py
+mo/front/common/partial_infer/utils.py
+mo/front/common/register_custom_ops.py
+mo/front/common/replacement.py
+mo/front/common/weights.py
+mo/front/extractor.py
+mo/front/kaldi/__init__.py
+mo/front/kaldi/extractor.py
+mo/front/kaldi/extractors/__init__.py
+mo/front/kaldi/extractors/add_ext.py
+mo/front/kaldi/extractors/add_shift_ext.py
+mo/front/kaldi/extractors/affine_component_ext.py
+mo/front/kaldi/extractors/affine_component_preconditioned_online_ext.py
+mo/front/kaldi/extractors/affine_transform_ext.py
+mo/front/kaldi/extractors/backproptruncation_ext.py
+mo/front/kaldi/extractors/batchnorm_component_ext.py
+mo/front/kaldi/extractors/clip_ext.py
+mo/front/kaldi/extractors/concat_ext.py
+mo/front/kaldi/extractors/convolutional_1d_component_ext.py
+mo/front/kaldi/extractors/convolutional_component_ext.py
+mo/front/kaldi/extractors/copy_ext.py
+mo/front/kaldi/extractors/crop_ext.py
+mo/front/kaldi/extractors/elementwise_component_ext.py
+mo/front/kaldi/extractors/fixed_affine_component_ext.py
+mo/front/kaldi/extractors/linear_component_ext.py
+mo/front/kaldi/extractors/lstm_nonlinearity_ext.py
+mo/front/kaldi/extractors/lstm_projected_streams_ext.py
+mo/front/kaldi/extractors/max_pooling_ext.py
+mo/front/kaldi/extractors/memoryoffset_ext.py
+mo/front/kaldi/extractors/naturalgradient_affine_component_ext.py
+mo/front/kaldi/extractors/noop_ext.py
+mo/front/kaldi/extractors/normalize_component_ext.py
+mo/front/kaldi/extractors/pnorm_component_ext.py
+mo/front/kaldi/extractors/rectified_linear_component_ext.py
+mo/front/kaldi/extractors/rescale_ext.py
+mo/front/kaldi/extractors/scale_component_ext.py
+mo/front/kaldi/extractors/slice_ext.py
+mo/front/kaldi/extractors/softmax_ext.py
+mo/front/kaldi/extractors/splice_component_ext.py
+mo/front/kaldi/loader/__init__.py
+mo/front/kaldi/loader/loader.py
+mo/front/kaldi/loader/utils.py
+mo/front/kaldi/register_custom_ops.py
+mo/front/kaldi/utils.py
+mo/front/mxnet/__init__.py
+mo/front/mxnet/extractor.py
+mo/front/mxnet/extractors/__init__.py
+mo/front/mxnet/extractors/add_n.py
+mo/front/mxnet/extractors/batchnorm.py
+mo/front/mxnet/extractors/concat.py
+mo/front/mxnet/extractors/l2_normalization.py
+mo/front/mxnet/extractors/multibox_prior.py
+mo/front/mxnet/extractors/relu.py
+mo/front/mxnet/extractors/scaleshift.py
+mo/front/mxnet/extractors/slice_axis.py
+mo/front/mxnet/extractors/utils.py
+mo/front/mxnet/loader.py
+mo/front/mxnet/nd_to_params.py
+mo/front/mxnet/register_custom_ops.py
+mo/front/onnx/__init__.py
+mo/front/onnx/extractor.py
+mo/front/onnx/extractors/__init__.py
+mo/front/onnx/extractors/concat.py
+mo/front/onnx/extractors/eltwise.py
+mo/front/onnx/extractors/fused_bn.py
+mo/front/onnx/extractors/reshape.py
+mo/front/onnx/extractors/utils.py
+mo/front/onnx/loader.py
+mo/front/onnx/register_custom_ops.py
+mo/front/subgraph_matcher.py
+mo/front/tf/__init__.py
+mo/front/tf/common.py
+mo/front/tf/custom_subgraph_call.py
+mo/front/tf/extractor.py
+mo/front/tf/extractors/__init__.py
+mo/front/tf/extractors/concat.py
+mo/front/tf/extractors/fused_bn.py
+mo/front/tf/extractors/identity.py
+mo/front/tf/extractors/native_tf.py
+mo/front/tf/extractors/pack.py
+mo/front/tf/extractors/random_uniform.py
+mo/front/tf/extractors/strided_slice.py
+mo/front/tf/extractors/utils.py
+mo/front/tf/graph_utils.py
+mo/front/tf/loader.py
+mo/front/tf/partial_infer/__init__.py
+mo/front/tf/partial_infer/tf.py
+mo/front/tf/register_custom_ops.py
+mo/front/tf/replacement.py
+mo/graph/__init__.py
+mo/graph/connection.py
+mo/graph/graph.py
+mo/graph/perm_inputs.py
+mo/graph/port.py
+mo/main.py
+mo/middle/__init__.py
+mo/middle/passes/__init__.py
+mo/middle/passes/conv.py
+mo/middle/passes/convert_data_type.py
+mo/middle/passes/debug.py
+mo/middle/passes/eliminate.py
+mo/middle/passes/fusing/__init__.py
+mo/middle/passes/fusing/decomposition.py
+mo/middle/passes/fusing/fuse_grouped_conv.py
+mo/middle/passes/fusing/fuse_linear_ops.py
+mo/middle/passes/fusing/fuse_linear_seq.py
+mo/middle/passes/fusing/helpers.py
+mo/middle/passes/fusing/mark_unfused_nodes.py
+mo/middle/passes/fusing/resnet_optimization.py
+mo/middle/passes/infer.py
+mo/middle/passes/leaky_relu.py
+mo/middle/passes/mean_scale_values.py
+mo/middle/passes/tensor_names.py
+mo/middle/pattern_match.py
+mo/middle/replacement.py
+mo/ops/__init__.py
+mo/ops/activation.py
+mo/ops/broadcast.py
+mo/ops/clamp.py
+mo/ops/concat.py
+mo/ops/const.py
+mo/ops/constant_of_shape.py
+mo/ops/convolution.py
+mo/ops/crop.py
+mo/ops/deconvolution.py
+mo/ops/deformable_convolution.py
+mo/ops/eltwise.py
+mo/ops/eltwise_n.py
+mo/ops/eltwise_ninputs_in_1.py
+mo/ops/expand_dims.py
+mo/ops/fill.py
+mo/ops/flatten.py
+mo/ops/group_norm.py
+mo/ops/lrn.py
+mo/ops/lstmnonlinearity.py
+mo/ops/memory.py
+mo/ops/memoryoffset.py
+mo/ops/op.py
+mo/ops/pad.py
+mo/ops/permute.py
+mo/ops/pooling.py
+mo/ops/power.py
+mo/ops/reshape.py
+mo/ops/result.py
+mo/ops/roipooling.py
+mo/ops/scale_shift.py
+mo/ops/shape.py
+mo/ops/slice.py
+mo/ops/softmax.py
+mo/ops/space_to_batch.py
+mo/ops/squeeze.py
+mo/ops/strided_slice.py
+mo/ops/tile.py
+mo/ops/unsqueeze.py
+mo/pipeline/__init__.py
+mo/pipeline/common.py
+mo/pipeline/unified.py
+mo/utils/__init__.py
+mo/utils/class_registration.py
+mo/utils/cli_parser.py
+mo/utils/custom_replacement_config.py
+mo/utils/dsu.py
+mo/utils/error.py
+mo/utils/find_inputs.py
+mo/utils/graph.py
+mo/utils/guess_framework.py
+mo/utils/import_extensions.py
+mo/utils/ir_engine/__init__.py
+mo/utils/ir_engine/compare_graphs.py
+mo/utils/ir_engine/ir_engine.py
+mo/utils/ir_reader/__init__.py
+mo/utils/ir_reader/extender.py
+mo/utils/ir_reader/extenders/binary_convolution_extender.py
+mo/utils/ir_reader/extenders/conv_extender.py
+mo/utils/ir_reader/extenders/convert_extender.py
+mo/utils/ir_reader/extenders/deconvolution_extender.py
+mo/utils/ir_reader/extenders/deformable_convolution_extender.py
+mo/utils/ir_reader/extenders/experimental_extender.py
+mo/utils/ir_reader/extenders/fakequantize_extender.py
+mo/utils/ir_reader/extenders/GRUCell_extender.py
+mo/utils/ir_reader/extenders/interpolate_extender.py
+mo/utils/ir_reader/extenders/LSTMCell_extender.py
+mo/utils/ir_reader/extenders/non_zero_extender.py
+mo/utils/ir_reader/extenders/pad_extender.py
+mo/utils/ir_reader/extenders/parameter_extender.py
+mo/utils/ir_reader/extenders/pooling_extender.py
+mo/utils/ir_reader/extenders/priorbox_clustered_extender.py
+mo/utils/ir_reader/extenders/priorbox_extender.py
+mo/utils/ir_reader/extenders/reorg_yolo_extender.py
+mo/utils/ir_reader/extenders/RNNCell_extender.py
+mo/utils/ir_reader/extenders/strided_slice_extender.py
+mo/utils/ir_reader/extenders/tensoriterator_extender.py
+mo/utils/ir_reader/extenders/topk_extender.py
+mo/utils/ir_reader/extenders/variadic_split_extender.py
+mo/utils/ir_reader/layer_to_class.py
+mo/utils/ir_reader/restore_graph.py
+mo/utils/logger.py
+mo/utils/model_analysis.py
+mo/utils/pipeline_config.py
+mo/utils/replacement_pattern.py
+mo/utils/shape.py
+mo/utils/simple_proto_parser.py
+mo/utils/str_to.py
+mo/utils/summarize_graph.py
+mo/utils/tensorboard_util.py
+mo/utils/unsupported_ops.py
+mo/utils/utils.py
+mo/utils/version.py
+mo/utils/versions_checker.py
+mo_caffe.py
+mo_kaldi.py
+mo_mxnet.py
+mo_onnx.py
+mo_tf.py
+requirements.txt
+requirements_caffe.txt
+requirements_kaldi.txt
+requirements_mxnet.txt
+requirements_onnx.txt
+requirements_tf.txt
diff --git a/model-optimizer/automation/utils.py b/model-optimizer/automation/utils.py
new file mode 100644 (file)
index 0000000..9045282
--- /dev/null
+++ b/model-optimizer/automation/utils.py
@@ -0,0 +1,47 @@
+import os
+import subprocess
+import tarfile
+from datetime import datetime
+from shutil import copy, copytree, rmtree
+
+
+
+class Automation:
+    @staticmethod
+    def parse_bom(bom_path):
+        files = []
+        for file in open(bom_path):
+            files.append(file)
+        return files
+
+    @staticmethod
+    def copy_files_from_bom(root_path, bom):
+        target_dir = os.path.join(os.path.dirname(__file__), "ModelOptimizerForTensorflow")
+        if os.path.exists(target_dir):
+            rmtree(target_dir)
+        os.makedirs(target_dir)
+        for file in bom:
+            src = os.path.join(root_path, file.strip('\n'))
+            dst = os.path.join(target_dir, file.strip('\n'))
+            if not os.path.exists(os.path.dirname(dst)):
+                os.makedirs(os.path.dirname(dst))
+            if os.path.isdir(src):
+                copytree(src, dst)
+            else:
+                copy(src, dst)
+        return target_dir
+
+    @staticmethod
+    def add_version_txt(dst_path, build_number):
+        timestamp = datetime.now().strftime("%I:%M%p %B %d, %Y")
+        with open(os.path.join(dst_path, "version.txt"), 'w') as f:
+            f.write(timestamp + '\n')
+            f.write(build_number + '\n')
+
+    @staticmethod
+    def make_tarfile(out_file_name, source_dir):
+        archive_path = os.path.join(os.path.dirname(__file__), out_file_name)
+        if os.path.exists(archive_path):
+            os.remove(archive_path)
+        with tarfile.open(archive_path, "w:gz") as tar:
+            tar.add(source_dir, arcname=os.path.basename(source_dir))
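The new automation/utils.py above wraps the Model Optimizer packaging steps: read the bill of materials, stage the listed files, stamp a version.txt, and archive the result. A minimal usage sketch follows, assuming the module is importable as automation.utils; the BOM path and build number shown are placeholders for illustration, not values taken from this commit:

    from automation.utils import Automation

    # Read the BOM: one repository-relative path per line (path below is illustrative).
    bom = Automation.parse_bom("automation/package_BOM.txt")
    # Stage every listed file under ModelOptimizerForTensorflow/ next to utils.py.
    target_dir = Automation.copy_files_from_bom("/path/to/model-optimizer", bom)
    # Record a timestamp and a build number inside the staged tree.
    Automation.add_version_txt(target_dir, "12345")
    # Pack the staged tree into a gzipped tarball.
    Automation.make_tarfile("ModelOptimizerForTensorflow.tar.gz", target_dir)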
index 7c9ce93..b4e6de4 100644 (file)
  limitations under the License.
 """
 
+import numpy as np
+
 from mo.graph.graph import Graph
 from mo.utils.model_analysis import AnalyzeAction
-import numpy as np
 
 
 class TrainingPhaseAnalysis(AnalyzeAction):
index 86809e2..6079918 100644 (file)
@@ -13,9 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import logging as log
-
 import json
+import logging as log
 import sys
 
 import numpy as np
index 1ed66cb..976511f 100644 (file)
@@ -16,7 +16,7 @@
 import logging as log
 
 from mo.graph.graph import Graph
-from mo.utils.model_analysis import AnalyzeAction, graph_contains_scope, AnalysisResults
+from mo.utils.model_analysis import AnalyzeAction, graph_contains_scope
 from mo.utils.utils import files_by_pattern, get_mo_root_dir
 
 
index 13b9c0b..d287afb 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import unittest
+
 import numpy as np
 
-import unittest
 from extensions.back.CutMemory import CutMemory
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class CutMemoryTest(unittest.TestCase):
index 16cfb5e..3eb526d 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+
 import numpy as np
 
 from extensions.back.ForceStrictPrecision import ForceStrictPrecision
index 5165092..073b88c 100644 (file)
 """
 
 import numpy as np
+
 from extensions.ops.split import VariadicSplit
-from mo.front.tf.graph_utils import create_op_node_with_second_input
-from mo.front.common.partial_infer.utils import int64_array
 from mo.back.replacement import BackReplacementPattern
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Graph
-from mo.ops.reshape import Reshape
 from mo.ops.const import Const
+from mo.ops.reshape import Reshape
 
 
 class LSTMCellNormalizer(BackReplacementPattern):
index a178c74..6b898af 100644 (file)
 """
 
 import logging as log
-
 import math
+
 import numpy as np
 
-from extensions.middle.FuseReshapesSequence import FuseReshapesSequence
 from extensions.back.FuseTransposesSequence import FuseTransposesSequence
+from extensions.middle.FuseReshapesSequence import FuseReshapesSequence
 from extensions.middle.RemoveRedundantReshapes import RemoveRedundantReshapes
 from mo.back.replacement import BackReplacementPattern
 from mo.front.common.partial_infer.utils import int64_array
index 7ef6255..1017ef4 100644 (file)
@@ -14,6 +14,7 @@
  limitations under the License.
 """
 import logging as log
+
 import numpy as np
 
 from extensions.back.ReshapeMutation import ReshapeMutation
index cd4b9e3..57c233b 100644 (file)
@@ -20,7 +20,7 @@ from generator import generator, generate
 
 from extensions.back.ReduceToPooling import ReduceReplacer, ReduceMerge
 from mo.front.common.partial_infer.utils import int64_array
-from mo.middle.passes.eliminate import shape_inference, eliminate_dead_nodes
+from mo.middle.passes.eliminate import shape_inference
 from mo.middle.passes.eliminate_test import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
 
index d3eda9b..f69c0e3 100644 (file)
@@ -18,8 +18,8 @@ import logging as log
 
 import numpy as np
 
-from mo.back.replacement import BackReplacementPattern
 from extensions.back.OptimizeTransposeReshapeSequence import set_reshape_new_output_shape
+from mo.back.replacement import BackReplacementPattern
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph
 
index 9021030..bf98865 100644 (file)
@@ -14,6 +14,7 @@
  limitations under the License.
 """
 import logging as log
+
 import numpy as np
 
 from extensions.back.ForceStrictPrecision import ForceStrictPrecision
index bfc4330..d03f958 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 
 from mo.back.replacement import BackReplacementPattern
-from mo.graph.graph import Graph, Node
-from mo.middle.pattern_match import for_each_sub_graph_recursively
+from mo.graph.graph import Graph
 
 
 class ShapeOfToShape(BackReplacementPattern):
index 3ec9ad4..3982172 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.back.ShufflenetReLUReorder import ShufflenetReLUReorder
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 # The dictionary with nodes attributes used to build various graphs. A key is the name of the node and the value is the
 # dictionary with node attributes.
index 543bf17..09422d3 100644 (file)
@@ -20,10 +20,8 @@ from copy import copy
 import numpy as np
 
 from extensions.back.pass_separator import BackFinish
-from extensions.ops.split import Split
 from extensions.ops.tensor_iterator import TensorIterator, get_internal_node_by_layer_id
 from mo.back.replacement import BackReplacementPattern
-from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Graph
 from mo.ops.const import Const
 from mo.utils.error import Error
index af9b7e1..c5f5888 100644 (file)
  limitations under the License.
 """
 import unittest
+
 import numpy as np
+
 from extensions.back.SpecialNodesFinalization import CreateConstNodesReplacement
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class CreateConstNodesReplacementTest(unittest.TestCase):
index 81aa824..0c63a78 100644 (file)
@@ -13,7 +13,6 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 
 from extensions.back.ConvolutionNormalizer import DeconvolutionNormalizer
 from extensions.back.CropToStridedSlice import CropToStridedSlice
index b58c264..bb00513 100644 (file)
@@ -20,8 +20,8 @@ import numpy as np
 from extensions.back.TileNormalizer import TileMultipleAxisReplacer, Tile3DReshaper
 from mo.front.common.partial_infer.utils import int64_array
 from mo.ops.tile import Tile
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class TileMultipleAxisReplacerTest(unittest.TestCase):
index 034b9bb..d0dd194 100644 (file)
 """
 
 import unittest
-import numpy as np
 from argparse import Namespace
 
+import numpy as np
+
 from extensions.back.compress_quantized_weights import CompressQuantizeWeights
 from extensions.ops.fakequantize import FakeQuantize
-from mo.ops.const import Const
 from mo.front.common.partial_infer.eltwise import eltwise_infer
 from mo.graph.graph import Node
+from mo.ops.const import Const
 from mo.utils.ir_engine.compare_graphs import compare_graphs
 from mo.utils.unittest.graph import build_graph
 
index 464ca85..87dc7e7 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.back.replacement import BackReplacementPattern
 from mo.graph.graph import Node, Graph
 from mo.utils.error import Error
index bb2d0d2..a09544e 100644 (file)
 """
 
 import numpy as np
-import networkx as nx
 
-from mo.ops.const import Const
-from mo.ops.op import Op
-from mo.graph.graph import Graph
 from mo.back.replacement import BackReplacementPattern
+from mo.graph.graph import Graph
+from mo.ops.const import Const
 
 
 class CompatibilityL2NormalizationPattern(BackReplacementPattern):
index 8b28cec..d633a59 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+
 import numpy as np
 
 from mo.front.common.replacement import FrontReplacementSubgraph
index 78c25dc..c285510 100644 (file)
 """
 
 import logging as log
+
 import numpy as np
 
-from mo.front.common.replacement import FrontReplacementPattern
 from mo.front.common.partial_infer.utils import int64_array
+from mo.front.common.replacement import FrontReplacementPattern
 from mo.graph.graph import Graph
 from mo.ops.const import Const
 from mo.ops.unsqueeze import Unsqueeze
index 8b44dd5..765c431 100644 (file)
@@ -16,6 +16,7 @@
 
 import logging as log
 from math import sqrt, fabs
+
 from extensions.ops.gelu import GeLUOP
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.graph.graph import Graph
index b2a5c11..46d7dbb 100644 (file)
@@ -16,9 +16,9 @@
 
 import numpy as np
 
+from extensions.ops.elementwise import Mul
 from mo.front.common.replacement import FrontReplacementOp
 from mo.graph.graph import Graph
-from extensions.ops.elementwise import Mul
 from mo.ops.const import Const
 
 
index cd1b8d0..6056184 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.LRNReplacer import LRNReplacer
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index e14ba92..98a4c9f 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.Log1p import Log1p
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder': {'shape': np.array([4, 5, 6]), 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
 """
 from extensions.ops.Log import LogOp
 from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Graph, Node
+from mo.graph.graph import Graph, Node, rename_nodes
 from mo.ops.softmax import Softmax
 
 
 class LogSoftmaxFrontReplacer(FrontReplacementOp):
     """
-    Replace LogSoftmax operation by Softmax -> Log.
+    Replace LogSoftmax operation with Softmax -> Log.
     """
     op = "LogSoftmax"
     enabled = True
 
     def replace_op(self, graph: Graph, node: Node):
-        axis = -1
-        if 'axis' in node.pb.attr:
-            axis = node.pb.attr['axis'].i
+        node_name = node.soft_get('name', node.id)
+        assert node.has_valid('axis'), 'The node "{}" does not have mandatory attribute "axis"'.format(node_name)
 
-        log = LogOp(graph, {'name': node.name + '/Log_'}).create_node()
-        softmax = Softmax(graph, {'axis': axis, 'name': node.name + '/SoftMax_'}).create_node()
+        log = LogOp(graph, {}).create_node()
+        softmax = Softmax(graph, {'axis': node.axis, 'name': node_name + '/Softmax'}).create_node()
+        rename_nodes([(node, node_name + '/delete'), (log, node_name)])
 
         # Connect nodes: input -> Softmax -> Log
         node.in_port(0).get_connection().set_destination(softmax.in_port(0))
         log.in_port(0).get_connection().set_source(softmax.out_port(0))
-
-        # The "explicit" version of the return value is: [(out_node.id, 0)])
         return [log.id]
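For context, the LogSoftmax replacement above is exact: log-softmax along an axis equals the logarithm of softmax along the same axis, which is precisely what the emitted Softmax -> Log chain computes. Below is a small, self-contained numpy check of that identity, independent of the Model Optimizer APIs (shapes chosen arbitrarily):

    import numpy as np

    def softmax(x, axis=-1):
        # Subtract the max before exponentiating for numerical stability.
        e = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return e / np.sum(e, axis=axis, keepdims=True)

    x = np.random.randn(3, 5)
    decomposed = np.log(softmax(x, axis=-1))  # Softmax followed by Log, as the replacer emits
    reference = x - np.log(np.sum(np.exp(x), axis=-1, keepdims=True))  # direct log-softmax
    assert np.allclose(decomposed, reference)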
diff --git a/model-optimizer/extensions/front/LogSoftmax_test.py b/model-optimizer/extensions/front/LogSoftmax_test.py
new file mode 100644 (file)
index 0000000..18bea40
--- /dev/null
+++ b/model-optimizer/extensions/front/LogSoftmax_test.py
@@ -0,0 +1,81 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+from extensions.front.LogSoftmax import LogSoftmaxFrontReplacer
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph, regular_op, result, connect
+
+nodes = {
+    **regular_op('input', {'type': 'Parameter'}),
+    **regular_op('logsoftmax', {'type': None, 'op': 'LogSoftmax', 'axis': -2, 'name': 'my_logsoftmax'}),
+    **result('output'),
+}
+edges = [
+    ('input', 'logsoftmax'),
+    ('logsoftmax', 'output'),
+]
+
+
+class LogSoftmaxReplacerTest(unittest.TestCase):
+    def test_1(self):
+        graph = build_graph(nodes, edges)
+
+        graph_ref = build_graph({
+            **regular_op('input', {'type': 'Parameter'}),
+            **regular_op('log', {'op': 'Log', 'type': 'Log'}),
+            **regular_op('softmax', {'op': 'SoftMax', 'type': 'SoftMax', 'axis': -2}),
+            **result('output'),
+        },
+            [
+                ('input', 'softmax'),
+                ('softmax', 'log'),
+                ('log', 'output'),
+            ])
+
+        graph.graph['layout'] = 'NCHW'
+        graph.stage = 'front'
+
+        LogSoftmaxFrontReplacer().find_and_replace_pattern(graph)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+        self.assertTrue(flag, resp)
+        self.assertTrue(graph.get_op_nodes(op='Log')[0].name == 'my_logsoftmax')
+
+    def test_2(self):
+        graph = build_graph(nodes, edges)
+
+        graph_ref = build_graph({
+            **regular_op('input', {'type': 'Parameter'}),
+            **regular_op('log', {'op': 'Log', 'type': 'Log'}),
+            **regular_op('softmax', {'op': 'SoftMax', 'type': 'SoftMax', 'axis': -2}),
+            **result('output'),
+        },
+            [
+                ('input', 'softmax'),
+                ('softmax', 'log'),
+                ('log', 'output'),
+            ])
+
+        graph.graph['layout'] = 'NHWC'
+        graph.stage = 'front'
+
+        LogSoftmaxFrontReplacer().find_and_replace_pattern(graph)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+        self.assertTrue(flag, resp)
+        self.assertTrue(graph.get_op_nodes(op='Log')[0].name == 'my_logsoftmax')
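The LogSoftmax_test.py added above builds the reference Softmax -> Log graph for both NCHW and NHWC layouts and additionally checks that the surviving Log node inherits the original LogSoftmax name. Assuming the repository's usual test layout, it can presumably be run in isolation with the stock unittest loader:

    import unittest

    # Module path assumed from the diff header above; adjust if the test package differs.
    suite = unittest.defaultTestLoader.loadTestsFromName('extensions.front.LogSoftmax_test')
    unittest.TextTestRunner(verbosity=2).run(suite)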
index a86bafa..3509f5a 100644 (file)
@@ -14,6 +14,7 @@
  limitations under the License.
 """
 import math
+
 import numpy as np
 
 from extensions.ops.MatMul import MatMul
index 37d4126..663d1ce 100644 (file)
 import unittest
 
 import numpy as np
+from generator import generator, generate
 
 from extensions.front.Pack import Pack
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
-
-from generator import generator, generate
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_0': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index f44672e..bec2383 100644 (file)
@@ -17,8 +17,8 @@
 import logging as log
 
 from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Graph
 from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
 from mo.ops.const import Const
 from mo.utils.error import Error
 
index fe825ef..0692bd1 100644 (file)
@@ -20,7 +20,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementPattern
 from mo.graph.graph import Graph
 from mo.ops.const import Const
-from mo.ops.result import Result
 
 
 class TopKNormalize(FrontReplacementPattern):
index 8dfe8ba..55c4ad7 100644 (file)
 
 import numpy as np
 
+from extensions.ops.elementwise import Add, Mul
 from mo.front.common.replacement import FrontReplacementPattern
 from mo.graph.graph import Graph
 from mo.ops.const import Const
-from extensions.ops.elementwise import Add, Mul
 
 
 class BinaryFakeQuantizeNormalization(FrontReplacementPattern):
index ee1501c..fa0d9cd 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.binary_quantize_normalization import BinaryFakeQuantizeNormalization
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 graph_nodes = {
     '0': {'name': 'input', 'kind': 'op', 'op': 'Parameter'},
index 252f266..817179c 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.accum_ext import AccumFrontExtractor
 from extensions.ops.accum import AccumOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeAccumProtoLayer:
index 2674fdc..b748eae 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.argmax_ext import ArgMaxFrontExtractor
 from extensions.ops.argmax import ArgMaxOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeArgMaxProtoLayer:
index dd9d8de..22ac09f 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.elementwise import Add
 from mo.front.common.replacement import FrontReplacementOp
 from mo.graph.graph import Node, Graph
-from extensions.ops.elementwise import Add
 from mo.ops.scale_shift import ScaleShiftOp
 
 
index 1958f4a..fd899d8 100644 (file)
@@ -19,9 +19,9 @@ import numpy as np
 
 from extensions.front.caffe.bn import BNToScaleShift
 from mo.graph.graph import Node
+from mo.utils.ir_engine.compare_graphs import compare_graphs
 from mo.utils.unittest.extractors import FakeParam
 from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
-from mo.utils.ir_engine.compare_graphs import compare_graphs
 
 
 class FakeBNProtoLayer:
index 469c1bc..8bb1943 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.correlation_ext import CorrelationFrontExtractor
 from extensions.ops.correlation import CorrelationOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeCorrProtoLayer:
index d882758..25bf849 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.ctcgreedydecoder_ext import CTCGreedyDecoderFrontExtractor
 from extensions.ops.ctc_greedy_decoder import CTCGreedyDecoderOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeCTCGreedyDecoderProtoLayer:
index 0f36668..33b4ef2 100644 (file)
@@ -21,9 +21,9 @@ import numpy as np
 
 from extensions.front.caffe.data_augmentation_ext import DataAugmentationFrontExtractor
 from extensions.ops.data_augmentation import DataAugmentationOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeDAProtoLayer:
index a799d55..eee09bc 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.activation_ops import Elu
 from mo.front.caffe.collect_attributes import collect_attributes
 from mo.front.extractor import FrontExtractorOp
-from extensions.ops.activation_ops import Elu
 
 
 class ELUFrontExtractor(FrontExtractorOp):
index cc3d64e..dc30e54 100644 (file)
@@ -15,7 +15,6 @@
 """
 
 import unittest
-
 from unittest.mock import patch
 
 from extensions.front.caffe.elu import ELUFrontExtractor
index 8aa502f..0499be2 100644 (file)
@@ -19,10 +19,10 @@ from unittest.mock import patch
 
 from extensions.front.caffe.grn_ext import GRNFrontExtractor
 from extensions.ops.grn import GRNOp
-from mo.utils.unittest.extractors import FakeMultiParam
-from mo.utils.unittest.graph import FakeNode
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.ops.op import Op
+from mo.utils.unittest.extractors import FakeMultiParam
+from mo.utils.unittest.graph import FakeNode
 
 
 class FakeGRNProtoLayer:
index 40e679e..ed76fcd 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.normalize_ext import NormalizeFrontExtractor
 from extensions.ops.normalize import NormalizeOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeNormalizeProtoLayer:
index fe4366f..be4ace6 100644 (file)
@@ -19,10 +19,10 @@ from unittest.mock import patch
 
 from extensions.front.caffe.power_file_ext import PowerFileFrontExtractor
 from extensions.ops.power_file import PowerFileOp
-from mo.utils.unittest.extractors import FakeMultiParam
-from mo.utils.unittest.graph import FakeNode
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.ops.op import Op
+from mo.utils.unittest.extractors import FakeMultiParam
+from mo.utils.unittest.graph import FakeNode
 
 
 class FakePowerFileProtoLayer:
index 453a9a0..ce25828 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.prelu_ext import PreluFrontExtractor
 from extensions.ops.prelu import PreluOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakePReLUProtoLayer:
index e8f3e73..f3ba679 100644 (file)
@@ -21,9 +21,9 @@ import numpy as np
 
 from extensions.front.caffe.priorbox_clustered_ext import PriorBoxClusteredFrontExtractor
 from extensions.ops.priorbox_clustered import PriorBoxClusteredOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakePriorBoxClusteredProtoLayer:
index e6af49b..db1d8b0 100644 (file)
@@ -21,9 +21,9 @@ import numpy as np
 
 from extensions.front.caffe.priorbox_ext import PriorBoxFrontExtractor
 from extensions.ops.priorbox import PriorBoxOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam, FakeParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeMultiParamListFields(FakeMultiParam):
index 673df1d..051ea60 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.proposal_ext import ProposalFrontExtractor
 from extensions.ops.proposal import ProposalOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode, FakeAttr
-from mo.ops.op import Op
 
 
 class FakeProposalProtoLayer:
index 2e41950..76c8297 100644 (file)
 """
 
 import unittest
-from unittest.mock import patch
 
 from extensions.front.caffe.proposal_python_ext import ProposalPythonFrontExtractor
 from extensions.ops.proposal import ProposalOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode, FakeAttr
-from mo.ops.op import Op
 
 
 class FakeProposalPythonProtoLayer:
index 00e0224..87db9dc 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.regionyolo_ext import RegionYoloFrontExtractor
 from extensions.ops.regionyolo import RegionYoloOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeRegionYoloProtoLayer:
index f77b997..bf6c29f 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.activation_ops import ReLU6
+from mo.front.extractor import FrontExtractorOp
 
 
 class ReLU6FrontExtractor(FrontExtractorOp):
index bff6b80..3775eb9 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.reorgyolo_ext import ReorgYoloFrontExtractor
 from extensions.ops.reorgyolo import ReorgYoloOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeReorgYoloProtoLayer:
index ade38d0..834e0ca 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.simplernms_ext import SimplerNMSFrontExtractor
 from extensions.ops.simplernms import SimplerNMSOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeSimplerNMSProtoLayer:
index 1cec6b1..8246e07 100644 (file)
@@ -19,9 +19,9 @@ from unittest.mock import patch
 
 from extensions.front.caffe.spatial_transformer_ext import SpatialTransformFrontExtractor
 from extensions.ops.spatial_transformer import SpatialTransformOp
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import FakeNode
-from mo.ops.op import Op
 
 
 class FakeSpatialTransformProtoLayer:
index 0811fad..668eb91 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.activation_ops import Tanh
+from mo.front.extractor import FrontExtractorOp
 
 
 class TanhFrontExtractor(FrontExtractorOp):
index b377072..55800f4 100644 (file)
@@ -15,9 +15,9 @@
 """
 
 
+from extensions.ops.elementwise import Add, Maximum, Mul
 from mo.front.common.replacement import FrontReplacementOp
 from mo.graph.graph import Node, Graph
-from extensions.ops.elementwise import Add, Maximum, Mul
 
 
 class EltwiseNReplacement(FrontReplacementOp):
index 21bcb02..dd0c5fd 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.eltwise_n import EltwiseNReplacement
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index ac94760..bb92472 100644 (file)
@@ -22,10 +22,8 @@ from extensions.ops.range import Range
 from extensions.ops.rank import Rank
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementPattern
-from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Graph
 from mo.ops.const import Const
-from mo.ops.unsqueeze import Unsqueeze
 
 
 class GlobalPoolingToReduce(FrontReplacementPattern):
index 6aadf80..dc18baa 100644 (file)
 
 import numpy as np
 
+from extensions.ops.elementwise import Mul, Add
 from mo.front.common.replacement import FrontReplacementOp
 from mo.graph.graph import Graph
 from mo.ops.const import Const
-from extensions.ops.elementwise import Mul, Add
 
 
 class ImageScaler(FrontReplacementOp):
index 52776cc..8c71426 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.image_scaler import ImageScaler
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index 1794096..8a22b7b 100644 (file)
  limitations under the License.
 """
 
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node, Graph
 from extensions.ops.elementwise import Add, Mul
 from extensions.ops.mvn import MVN
+from mo.front.common.replacement import FrontReplacementOp
+from mo.graph.graph import Node, Graph
 
 
 class InstanceNormalization(FrontReplacementOp):
index bf3b26d..26814a2 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import networkx as nx
 
 from extensions.front.instance_normalization import InstanceNormalization
-from mo.utils.unittest.graph import build_graph
 from mo.middle.pattern_match import node_match
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'input': {'kind': 'op', 'op': 'AnyOp'},
index abeda4a..e654a65 100644 (file)
@@ -17,6 +17,7 @@
 import unittest
 
 import numpy as np
+
 from extensions.front.kaldi.apply_counts import apply_biases_to_last_layer
 from mo.utils.ir_engine.compare_graphs import compare_graphs
 from mo.utils.unittest.graph import build_graph
diff --git a/model-optimizer/extensions/front/kaldi/logsoftmax.py b/model-optimizer/extensions/front/kaldi/logsoftmax.py
deleted file mode 100644 (file)
index 1cda00d..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-"""
- Copyright (C) 2018-2020 Intel Corporation
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-"""
-from extensions.ops.Log import LogOp
-from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Graph, Node
-from mo.ops.softmax import Softmax
-
-
-class LogsoftmaxFrontReplacer(FrontReplacementOp):
-    """
-    Replace LogSoftmax operation by Softmax -> Log.
-    """
-    op = "logsoftmaxcomponent"
-    enabled = True
-
-    def replace_op(self, graph: Graph, node: Node):
-        log = LogOp(graph, {'name': node.name + '/Log_'}).create_node()
-        softmax = Softmax(graph, {'axis': 1, 'name': node.name + '/SoftMax_'}).create_node()
-
-        # Connect nodes: input -> Softmax -> Log
-        node.in_port(0).get_connection().set_destination(softmax.in_port(0))
-        log.in_port(0).get_connection().set_source(softmax.out_port(0))
-
-        # The "explicit" version of the return value is: [(out_node.id, 0)])
-        return [log.id]
diff --git a/model-optimizer/extensions/front/kaldi/logsoftmax_component_ext.py b/model-optimizer/extensions/front/kaldi/logsoftmax_component_ext.py
new file mode 100644 (file)
index 0000000..8d4ddc6
--- /dev/null
@@ -0,0 +1,28 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.ops.softmax import LogSoftmax
+from mo.front.extractor import FrontExtractorOp
+
+
+class LogSoftMaxComponentExtractor(FrontExtractorOp):
+    op = 'logsoftmaxcomponent'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        LogSoftmax.update_node_stat(node, {'axis': 1})
+        return cls.enabled
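
Note: the two hunks above change how the Kaldi `logsoftmaxcomponent` is handled: the front replacer that rewrote it into a Softmax followed by a Log is removed, and the component is now extracted directly as a LogSoftmax with axis 1. A minimal NumPy-only sketch (not part of the patch; function names are illustrative) of why the two formulations agree:

    import numpy as np

    def log_softmax_via_softmax(x, axis=1):
        # The removed front replacer modelled this as Softmax followed by Log.
        s = np.exp(x - x.max(axis=axis, keepdims=True))
        s /= s.sum(axis=axis, keepdims=True)
        return np.log(s)

    def log_softmax_direct(x, axis=1):
        # The new extractor marks the node as a single LogSoftmax instead.
        shifted = x - x.max(axis=axis, keepdims=True)
        return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

    x = np.random.randn(4, 10).astype(np.float32)
    assert np.allclose(log_softmax_via_softmax(x), log_softmax_direct(x), atol=1e-5)
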
index f79f6d1..2c7579f 100644 (file)
@@ -18,7 +18,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementOp
 from mo.front.tf.graph_utils import create_op_with_const_inputs
 from mo.graph.graph import Node, Graph
-from mo.ops.const import Const
 from mo.ops.eltwise import Eltwise
 from mo.ops.eltwise_n import EltwiseN
 from mo.utils.error import Error
index 882fbfa..8d30458 100644 (file)
@@ -15,8 +15,8 @@
 """
 
 from extensions.front.kaldi.sigmoid_ext import SigmoidFrontExtractor
-from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
 from extensions.ops.activation_ops import Sigmoid
+from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
 from mo.ops.op import Op
 
 
index 7f0195e..5f8df79 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
-from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
 from extensions.front.kaldi.tanh_component_ext import TanhFrontExtractor
 from extensions.ops.activation_ops import Tanh
+from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
 from mo.ops.op import Op
 
 
index 2115cca..f5f236e 100644 (file)
 """
 
 import unittest
+from argparse import Namespace
 
 import numpy as np
-from argparse import Namespace
 
-from mo.graph.graph import Node
 from extensions.front.mxnet.add_input_data_to_prior_boxes import AddInputDataToPriorBoxes
+from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
 
index bbe0e61..7a16959 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.front.mxnet.check_softmax_node_inputs import CheckSoftmaxNodeInputs
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestCheckSoftmaxNodeInputs(unittest.TestCase):
index ac42795..0b02c2d 100644 (file)
@@ -19,7 +19,7 @@ import numpy as np
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.convolution import Convolution
-from mo.front.common.extractors.utils import layout_attrs
+
 
 class ConvFrontExtractor(FrontExtractorOp):
     op = 'Convolution'
index fda4174..47d69d9 100644 (file)
@@ -17,9 +17,9 @@
 import unittest
 
 from extensions.front.mxnet.custom import CustomFrontExtractorOp
-from mo.utils.unittest.graph import build_graph
 from mo.front.extractor import FrontExtractorOp, MXNetCustomFrontExtractorOp
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 attrs = {'test_attr': 1}
 
index 81e8411..ee43443 100644 (file)
@@ -15,7 +15,8 @@
 """
 import numpy as np
 
-from extensions.ops.elementwise import Mul, Sub, Add, Maximum, Minimum, Div, Greater, GreaterEqual, Equal, Less, LessEqual, Pow, NotEqual, LogicalAnd, LogicalOr
+from extensions.ops.elementwise import Mul, Sub, Add, Maximum, Minimum, Div, Greater, GreaterEqual, Equal, Less, \
+    LessEqual, Pow, NotEqual, LogicalAnd, LogicalOr
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.graph.graph import Node
index 69880e6..48a30d7 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.mxnet.gather import GatherFrontReplacer
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class GatherTest(unittest.TestCase):
index 2402afb..3cc61ab 100644 (file)
  limitations under the License.
 """
 
-from mo.graph.graph import Node
 from extensions.ops.instance_normalization import InstanceNormalization
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
+from mo.graph.graph import Node
 
 
 class InstanceNormFrontExtractor(FrontExtractorOp):
index feee647..05b1d9e 100644 (file)
  limitations under the License.
 """
 
+from extensions.ops.DetectionOutput import DetectionOutput
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 
-from extensions.ops.DetectionOutput import DetectionOutput
 
 class MultiBoxDetectionOutputExtractor(FrontExtractorOp):
     op = '_contrib_MultiBoxDetection'
index 6be834c..05cd662 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementOp
 from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Graph
-from mo.ops.const import Const
 from mo.ops.reshape import Reshape
 from mo.ops.shape import Shape
 from mo.ops.squeeze import Squeeze
index 10da838..ec9c36c 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.parameter import Parameter
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
 from mo.ops.const import Const
-from mo.ops.pad import Pad
 
 
 class NullFrontExtractor(FrontExtractorOp):
index 918cf53..1ca39f9 100644 (file)
@@ -16,8 +16,8 @@
 
 import numpy as np
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.pad import AttributedPad
 
 
index 1d17059..5d803fe 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.crop import Crop
 
 
index 135ccc5..094a82d 100644 (file)
 
 import numpy as np
 
-from mo.graph.graph import Graph
 from extensions.ops.elementwise import Mul
-from mo.ops.const import Const
 from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Graph
+from mo.ops.const import Const
 
 
 class SoftmaxFrontReplacementSubgraph(FrontReplacementSubgraph):
index 0462a3b..ab2396f 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.softmax import Softmax
 
 
index 52fa400..5fb37ee 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.softmax import Softmax
 
 
index 8a5247f..c88c9e9 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.softmax import Softmax
 
 
index c3423de..0dec2ff 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.squeeze import Squeeze
 
 
index a4c0627..8d0f061 100644 (file)
@@ -21,8 +21,8 @@ from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Node, Graph
 from mo.middle.pattern_match import find_pattern_matches
-from mo.ops.result import Result
 from mo.ops.reshape import Reshape
+from mo.ops.result import Result
 
 
 class SsdPatternDetectionOutputReplacer(FrontReplacementSubgraph):
index cb2485c..104b128 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSsdPatternFlattenSoftmaxActivation(unittest.TestCase):
index 80015eb..12061e9 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSsdPatternRemoveFlatten(unittest.TestCase):
index bb56b94..d2a7234 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSsdPatternRemoveReshape(unittest.TestCase):
index 2eab60a..997ea99 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
 from extensions.front.mxnet.ssd_pattern_remove_flatten import SsdPatternRemoveFlatten
 from extensions.front.mxnet.ssd_pattern_remove_reshape import SsdPatternRemoveReshape
index 8450c02..38a9d5a 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSsdPatternRemoveTranspose(unittest.TestCase):
index 0b3fb7a..b4a55a3 100644 (file)
  limitations under the License.
 """
 
-import networkx as nx
-
-from mo.graph.graph import Graph
-from mo.front.common.replacement import FrontReplacementPattern
-from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
 from extensions.front.mxnet.ssd_pattern_flatten_softmax_activation import SsdPatternFlattenSoftmaxActivation
+from extensions.front.mxnet.ssd_pattern_remove_transpose import SsdPatternRemoveTranspose
+from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
 
 
 class SsdReorderDetectionOutInputs(FrontReplacementPattern):
index d97a82a..91d7293 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.mxnet.ssd_reorder_detection_out_inputs import SsdReorderDetectionOutInputs
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSsdReorderDetectionOutInputs(unittest.TestCase):
index 86bc032..afbac1f 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.pack import PackOp
 from mo.front.extractor import FrontExtractorOp
 from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
-from extensions.ops.pack import PackOp
 
 
 class StackFrontExtractor(FrontExtractorOp):
index 06441ee..e0fc4c0 100644 (file)
  limitations under the License.
 """
 
-import ast
 import numpy as np
 
-from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.front.extractor import FrontExtractorOp
+from mo.front.mxnet.extractors.utils import get_mxnet_layer_attrs
 from mo.ops.const import Const
 
 
index 64a155f..cc131ad 100644 (file)
@@ -16,8 +16,6 @@
 
 import logging as log
 
-import networkx as nx
-
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.graph.graph import Graph
 
index d9e5c7a..e3d874f 100644 (file)
@@ -20,8 +20,8 @@ import numpy as np
 import onnx
 
 from extensions.front.onnx.affine_ext import AffineFrontExtractor
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class AffineONNXExtractorTest(unittest.TestCase):
index 73cbe66..de3983f 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.constant_fill import ConstantFill
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
index dbe54a0..aebd258 100644 (file)
 
 import numpy as np
 
-from mo.front.common.extractors.utils import layout_attrs
+from mo.front.common.partial_infer.utils import int64_array
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr, get_onnx_autopad
 from mo.ops.convolution import Convolution
 from mo.utils.error import Error
-from mo.front.common.partial_infer.utils import int64_array
 
 
 class ConvFrontExtractor(FrontExtractorOp):
index 2477fa6..85f1c42 100644 (file)
@@ -20,9 +20,9 @@ import numpy as np
 import onnx
 
 from extensions.front.onnx.conv_ext import ConvTransposeFrontExtractor
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
 from mo.utils.error import Error
+from mo.utils.unittest.graph import build_graph
 
 
 class ConvTransposeONNXExtractorTest(unittest.TestCase):
index add2794..2fd6a8e 100644 (file)
@@ -20,8 +20,8 @@ import numpy as np
 import onnx
 
 from extensions.front.onnx.crop_ext import CropFrontExtractor
-from mo.utils.unittest.graph import build_graph
 from mo.graph.graph import Node
+from mo.utils.unittest.graph import build_graph
 
 
 class CropONNXExtractorTest(unittest.TestCase):
index 9f0ed86..94b1eaf 100644 (file)
@@ -19,8 +19,6 @@ import numpy as np
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr, get_onnx_autopad
 from mo.ops.deformable_convolution import DeformableConvolution
-from mo.utils.error import Error
-from mo.front.common.partial_infer.utils import int64_array
 
 
 class DeformableConvExtractor(FrontExtractorOp):
index dddc5ac..267e801 100644 (file)
  limitations under the License.
 """
 
-import onnx
 import unittest
 
 import numpy as np
+import onnx
 
 from extensions.front.onnx.detection_output import DetectionOutputFrontExtractor
 from extensions.ops.DetectionOutput import DetectionOutput
index 5904864..f72c80b 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 from math import log
+
 import numpy as np
 
 from extensions.ops.detectionoutput_onnx import ExperimentalDetectronDetectionOutput
index 13d6112..6d23b3e 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.identity import IdentityOp
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
-from extensions.ops.identity import IdentityOp
 from mo.utils.error import Error
 
 
index e504fe8..349ba47 100644 (file)
@@ -17,8 +17,6 @@
 import numpy as np
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.op import Op
-
 from mo.front.onnx.extractors.utils import onnx_attr
 
 
index 3e6e0cd..4b1702d 100644 (file)
@@ -26,7 +26,6 @@ from mo.graph.graph import Graph
 from mo.graph.graph import Node
 from mo.ops.reshape import Reshape
 
-
 input_fpn_heads = ('486', '454', '422', '390')
 
 
diff --git a/model-optimizer/extensions/front/onnx/non_zero_ext.py b/model-optimizer/extensions/front/onnx/non_zero_ext.py
new file mode 100644 (file)
index 0000000..6f3e97b
--- /dev/null
@@ -0,0 +1,29 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+import numpy as np
+
+from extensions.ops.non_zero import NonZero
+from mo.front.extractor import FrontExtractorOp
+
+
+class NonZeroExtractor(FrontExtractorOp):
+    op = 'NonZero'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        NonZero.update_node_stat(node, {'output_type': np.int64})
+        return cls.enabled
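
Note: the new extractor maps ONNX NonZero onto the internal NonZero op with an int64 output type, which matches the reference NumPy behaviour. A small sketch, assuming nothing beyond NumPy, of the output layout the op is expected to produce (one row of coordinates per input dimension):

    import numpy as np

    x = np.array([[1, 0, 2],
                  [0, 3, 0]])

    # ONNX NonZero returns the coordinates of non-zero elements as a
    # (rank, num_nonzero) int64 tensor; np.nonzero yields the same data
    # as a tuple of per-dimension index arrays.
    indices = np.stack(np.nonzero(x)).astype(np.int64)
    print(indices)        # [[0 0 1]
                          #  [0 2 1]]
    print(indices.dtype)  # int64
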
diff --git a/model-optimizer/extensions/front/onnx/person_detection_crossroad.json b/model-optimizer/extensions/front/onnx/person_detection_crossroad.json
new file mode 100644 (file)
index 0000000..8fbd555
--- /dev/null
@@ -0,0 +1,12 @@
+[
+  {
+    "custom_attributes":
+    {
+      "fpn_heads": ["634", "635", "636", "637"],
+      "ROI_feature_extractor_inputs": ["2475", "2834", "3192"],
+      "ROI_feature_extractor_outputs": ["2614", "2972", "3330"]
+    },
+    "id": "ONNXPersonDetectionCrossroadReplacement",
+    "match_kind": "general"
+  }
+]
diff --git a/model-optimizer/extensions/front/onnx/person_detection_crossroad_conversion.py b/model-optimizer/extensions/front/onnx/person_detection_crossroad_conversion.py
new file mode 100644 (file)
index 0000000..329f3dd
--- /dev/null
@@ -0,0 +1,56 @@
+"""
+ Copyright (c) 2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.roifeatureextractor_onnx import ExperimentalDetectronROIFeatureExtractor
+from mo.front.common.partial_infer.utils import int64_array
+from mo.front.tf.replacement import FrontReplacementFromConfigFileGeneral
+from mo.graph.graph import Graph, Node, rename_node
+
+
+class ONNXPersonDetectionCrossroadReplacement(FrontReplacementFromConfigFileGeneral):
+    """
+    Insert ExperimentalDetectronROIFeatureExtractor layers instead of sub-graphs of the model.
+    """
+    replacement_id = 'ONNXPersonDetectionCrossroadReplacement'
+
+    def transform_graph(self, graph: Graph, replacement_descriptions: dict):
+        fpn_heads = replacement_descriptions['fpn_heads']
+        for inp, out in zip(replacement_descriptions['ROI_feature_extractor_inputs'],
+                            replacement_descriptions['ROI_feature_extractor_outputs']):
+            insert_experimental_layers(graph, fpn_heads, inp, out)
+
+
+def insert_experimental_layers(graph: Graph, input_fpn_heads: list, inp: str, out: str):
+    old_output_node = Node(graph, out)
+    output_name = old_output_node.soft_get('name', old_output_node.id)
+    old_output_node_name = output_name + '/old'
+    rename_node(old_output_node, old_output_node_name)
+
+    input_fpn_head_nodes = [Node(graph, node_id) for node_id in input_fpn_heads]
+    fpn_roi_align = ExperimentalDetectronROIFeatureExtractor(graph, {'name': output_name,
+                                                                     'distribute_rois_between_levels': 1,
+                                                                     'image_id': 0,
+                                                                     'output_size': 7,
+                                                                     'preserve_rois_order': 1,
+                                                                     'pyramid_scales': int64_array(
+                                                                         [4, 8, 16, 32, 64]),
+                                                                     'sampling_ratio': 2, }).create_node()
+    rename_node(fpn_roi_align, output_name)
+    fpn_roi_align.in_port(0).connect(Node(graph, inp).out_port(0))
+    for ind, fpn_node in enumerate(input_fpn_head_nodes):
+        fpn_roi_align.in_port(ind + 1).connect(fpn_node.out_port(0))
+
+    old_output_node.out_port(0).get_connection().set_source(fpn_roi_align.out_port(0))
index a9d1d70..b86fef2 100644 (file)
@@ -79,13 +79,17 @@ class GlobalMaxPoolFrontExtractor(FrontExtractorOp):
 
 
 def common_onnx_pool_extractor(node):
+    kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
+    final_kernel_shape = np.array([1, 1, *[x for x in kernel_shape]], dtype=np.int64) if kernel_shape is not None else None
+
     pads = onnx_attr(node, 'pads', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
 
-    # Try to convert slightly incorrect models with insufficient pad parameters
-    if pads is not None and (pads.size == 2 or pads.size % 2 != 0):
-        log.warning(
-            'Node {} has pad = {} which is ill-formed -- it should consist of N%2==0 elements.'.format(node.name,
-                                                                                                       pads))
+    if kernel_shape is not None and pads is not None and kernel_shape.size * 2 != pads.size:
+        log.warning('Node {} has pad = {} which is ill-formed -- it should have even amount of elements.'.format(
+            node.soft_get('name', node.id), pads))
+
+        # Try to convert slightly incorrect models with insufficient pad parameters
+        assert pads.size * 2 == kernel_shape.size
         pads = np.concatenate([pads, pads])
         log.warning('Extended pads to {}'.format(pads))
 
@@ -102,10 +106,8 @@ def common_onnx_pool_extractor(node):
     final_strides = np.array([1, 1, *[x for x in strides]], dtype=np.int64) if strides is not None else None
 
     dilations = onnx_attr(node, 'dilations', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
-    assert dilations is None, 'dilations attribute is not supported in node {}'.format(node.id)
-
-    kernel_shape = onnx_attr(node, 'kernel_shape', 'ints', default=None, dst_type=lambda x: np.array(x, dtype=np.int64))
-    final_kernel_shape = np.array([1, 1, *[x for x in kernel_shape]], dtype=np.int64) if kernel_shape is not None else None
+    assert dilations is None or np.all(dilations == 1),\
+        'Node {} has "dilations" attribute with values not equal to 1s which is not supported'.format(node.id)
 
     # exclude_pad = True only when count_include_pad == 0
     exclude_pad = onnx_attr(node, 'count_include_pad', 'i', default=0) == 0
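
Note: the reworked pooling extractor now reads `kernel_shape` before `pads`, so a truncated `pads` attribute can be checked against the kernel rank and duplicated, and the old hard rejection of `dilations` is relaxed to accept dilations that are all 1. A hedged NumPy-only sketch of the shape normalization the extractor performs (attribute reading is mocked; names are illustrative, not the patch's own):

    import numpy as np

    def normalize_pool_attrs(kernel_shape, strides, dilations):
        # Prepend batch and channel dims, as the extractor does with
        # np.array([1, 1, *attr]) for the spatial-only ONNX attributes.
        final_kernel = np.array([1, 1, *kernel_shape], dtype=np.int64)
        final_strides = np.array([1, 1, *strides], dtype=np.int64) if strides is not None else None

        # Mirrors the relaxed check in the patch: dilations may be present
        # as long as every value equals 1.
        assert dilations is None or np.all(np.array(dilations) == 1)
        return final_kernel, final_strides

    print(normalize_pool_attrs([3, 3], [2, 2], [1, 1]))
    # (array([1, 1, 3, 3]), array([1, 1, 2, 2]))
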
index 65652cf..118b04a 100644 (file)
  limitations under the License.
 """
 
-import onnx
 import unittest
 
 import numpy as np
+import onnx
 
 from extensions.front.onnx.priorbox_clustered_ext import PriorBoxClusteredFrontExtractor
 from extensions.ops.priorbox_clustered import PriorBoxClusteredOp
index 66055a3..fdf96e5 100644 (file)
  limitations under the License.
 """
 
-import onnx
 import unittest
 
 import numpy as np
+import onnx
 
 from extensions.front.onnx.priorbox_ext import PriorBoxFrontExtractor
 from extensions.ops.priorbox import PriorBoxOp
index 63bfda6..7e60e5a 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.fakequantize import FakeQuantize
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
-from extensions.ops.fakequantize import FakeQuantize
 
 
 class FakeQuantizeFrontExtractor(FrontExtractorOp):
diff --git a/model-optimizer/extensions/front/onnx/reverse_sequence_ext.py b/model-optimizer/extensions/front/onnx/reverse_sequence_ext.py
new file mode 100644 (file)
index 0000000..48cf743
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (c) 2019 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from extensions.ops.reverse_sequence import ReverseSequence
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ReverseSequenceExtractor(FrontExtractorOp):
+    op = 'ReverseSequence'
+    enabled = True
+
+    @staticmethod
+    def extract(node):
+        batch_axis = onnx_attr(node, 'batch_axis', 'i', default=1)
+        time_axis = onnx_attr(node, 'time_axis', 'i', default=0)
+
+        attrs = {
+            'batch_axis': batch_axis,
+            'seq_axis': time_axis,
+        }
+        ReverseSequence.update_node_stat(node, attrs)
+        return __class__.enabled
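
Note: the new extractor maps the ONNX ReverseSequence attributes `batch_axis` (default 1) and `time_axis` (default 0) onto the internal `batch_axis`/`seq_axis` pair. A purely illustrative NumPy sketch of what the op computes for the default, time-major layout:

    import numpy as np

    def reverse_sequence(x, seq_lengths, seq_axis=0, batch_axis=1):
        # Reverse the first seq_lengths[b] elements along seq_axis,
        # independently for every batch element b.
        out = x.copy()
        for b, length in enumerate(seq_lengths):
            sl = [slice(None)] * x.ndim
            sl[batch_axis] = b
            sl_rev = list(sl)
            sl[seq_axis] = slice(0, length)
            sl_rev[seq_axis] = slice(length - 1, None, -1)
            out[tuple(sl)] = x[tuple(sl_rev)]
        return out

    x = np.arange(12).reshape(4, 3)      # 4 time steps, 3 sequences
    print(reverse_sequence(x, [2, 3, 1]))
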
diff --git a/model-optimizer/extensions/front/onnx/roialign_ext.py b/model-optimizer/extensions/front/onnx/roialign_ext.py
new file mode 100644 (file)
index 0000000..bcf97fc
--- /dev/null
@@ -0,0 +1,36 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.roialign import ROIAlign
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ROIAlignExtractor(FrontExtractorOp):
+    op = 'ROIAlign'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        mode = onnx_attr(node, 'mode', 's', default=b'avg').decode()
+        output_height = onnx_attr(node, 'output_height', 'i', default=1)
+        output_width = onnx_attr(node, 'output_width', 'i', default=1)
+        sampling_ratio = onnx_attr(node, 'sampling_ratio', 'i', default=0)
+        spatial_scale = onnx_attr(node, 'spatial_scale', 'f', default=1.0)
+
+        ROIAlign.update_node_stat(node, {'pooled_h': output_height, 'pooled_w': output_width,
+                                         'sampling_ratio': sampling_ratio, 'spatial_scale': spatial_scale,
+                                         'mode': mode})
+        return cls.enabled
diff --git a/model-optimizer/extensions/front/onnx/scatter_ext.py b/model-optimizer/extensions/front/onnx/scatter_ext.py
new file mode 100644 (file)
index 0000000..8d3be1c
--- /dev/null
@@ -0,0 +1,41 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from extensions.ops.scatter import ScatterElementsUpdate
+from mo.front.extractor import FrontExtractorOp
+from mo.front.onnx.extractors.utils import onnx_attr
+
+
+class ScatterExtractor(FrontExtractorOp):
+    # deprecated ONNX operation
+    op = 'Scatter'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        axis = onnx_attr(node, 'axis', 'i', default=0)
+        ScatterElementsUpdate.update_node_stat(node, {'axis': axis})
+        return cls.enabled
+
+
+class ScatterElementsExtractor(FrontExtractorOp):
+    op = 'ScatterElements'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        axis = onnx_attr(node, 'axis', 'i', default=0)
+        ScatterElementsUpdate.update_node_stat(node, {'axis': axis})
+        return cls.enabled
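
Note: both the deprecated ONNX Scatter op and ScatterElements are extracted into the same internal ScatterElementsUpdate, keeping only the `axis` attribute. A NumPy sketch (illustrative only, not the plugin implementation) of the element-wise update the op performs along an axis:

    import numpy as np

    def scatter_elements_update(data, indices, updates, axis=0):
        # For every position p in `updates`, replace the element of `data`
        # whose coordinate along `axis` is indices[p] (other coords equal p).
        out = data.copy()
        for p in np.ndindex(indices.shape):
            target = list(p)
            target[axis] = indices[p]
            out[tuple(target)] = updates[p]
        return out

    data = np.zeros((3, 3), dtype=np.int64)
    indices = np.array([[1, 0, 2],
                        [0, 2, 1]])
    updates = np.array([[10, 20, 30],
                        [40, 50, 60]])
    print(scatter_elements_update(data, indices, updates, axis=0))
    # [[40 20  0]
    #  [10  0 60]
    #  [ 0 50 30]]
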
index 9cfedd8..f517f68 100644 (file)
 
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
-from mo.ops.softmax import Softmax
+from mo.ops.softmax import LogSoftmax, Softmax
 
 
-class SoftmaxFrontExtractor(FrontExtractorOp):
+class SoftmaxExtractor(FrontExtractorOp):
     op = 'Softmax'
     enabled = True
 
     @classmethod
     def extract(cls, node):
         axis = onnx_attr(node, 'axis', 'i', default=1)
+        Softmax.update_node_stat(node, {'axis': axis})
+        return cls.enabled
+
 
-        attrs = {
-            'axis': axis
-        }
+class LogSoftmaxExtractor(FrontExtractorOp):
+    op = 'LogSoftmax'
+    enabled = True
 
-        # update the attributes of the node
-        Softmax.update_node_stat(node, attrs)
+    @classmethod
+    def extract(cls, node):
+        axis = onnx_attr(node, 'axis', 'i', default=1)
+        LogSoftmax.update_node_stat(node, {'axis': axis})
         return cls.enabled
index b5049e3..c7ea1c7 100644 (file)
@@ -16,9 +16,9 @@
 
 import numpy as np
 
-from mo.ops.squeeze import Squeeze
 from mo.front.extractor import FrontExtractorOp
 from mo.front.onnx.extractors.utils import onnx_attr
+from mo.ops.squeeze import Squeeze
 
 
 class SqueezeFrontExtractor(FrontExtractorOp):
index a09cef1..8030a52 100644 (file)
@@ -22,8 +22,8 @@ import onnx
 from generator import generator, generate
 
 from extensions.front.onnx.transpose_ext import TransposeFrontExtractor
-from mo.ops.op import Op
 from extensions.ops.transpose import Transpose
+from mo.ops.op import Op
 from mo.utils.unittest.extractors import PB
 
 
index d99fbb3..4ba3c6f 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.reciprocal import ReciprocalReplacer
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
diff --git a/model-optimizer/extensions/front/scatter_normalizer.py b/model-optimizer/extensions/front/scatter_normalizer.py
new file mode 100644 (file)
index 0000000..07afa84
--- /dev/null
@@ -0,0 +1,42 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import numpy as np
+
+from mo.front.common.replacement import FrontReplacementPattern
+from mo.graph.graph import Graph
+from mo.ops.const import Const
+
+
+class ScatterNormalizer(FrontReplacementPattern):
+    enabled = True
+
+    def find_and_replace_pattern(self, graph: Graph):
+        for node in graph.get_op_nodes(is_scatter=True):
+            name = node.soft_get('name', node.id)
+            input_ports_count = len([port for port in node.in_ports().values() if not port.disconnected()])
+            has_axis = node.has_valid('axis')
+
+            if has_axis:
+                assert input_ports_count == 3, \
+                    '{} node {} has unexpected number of input ports {}'.format(node.op, name, input_ports_count)
+                const = Const(graph, {'name': name + '/axis', 'value': np.int64(node.axis)}).create_node()
+                node.add_input_port(3, skip_if_exist=True)
+                node.in_port(3).connect(const.out_port(0))
+                del node['axis']
+            else:
+                assert input_ports_count == 4, \
+                    '{} node {} has unexpected number of input ports {}'.format(node.op, name, input_ports_count)
diff --git a/model-optimizer/extensions/front/scatter_normalizer_test.py b/model-optimizer/extensions/front/scatter_normalizer_test.py
new file mode 100644 (file)
index 0000000..016cd99
--- /dev/null
@@ -0,0 +1,80 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+import numpy as np
+
+from extensions.front.scatter_normalizer import ScatterNormalizer
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph, result, connect, \
+    regular_op_with_empty_data
+
+nodes = {
+    **regular_op_with_empty_data('placeholder_1', {'type': 'Parameter'}),
+    **regular_op_with_empty_data('placeholder_2', {'type': 'Parameter'}),
+    **regular_op_with_empty_data('placeholder_3', {'type': 'Parameter'}),
+    **regular_op_with_empty_data('node', {'op': 'ScatterElementsUpdate', 'is_scatter': True}),
+    **regular_op_with_empty_data('axis', {'type': 'Const', 'value': None}),
+    **result(),
+}
+
+edges = [
+    *connect('placeholder_1', '0:node'),
+    *connect('placeholder_2', '1:node'),
+    *connect('placeholder_3', '2:node'),
+    *connect('node', 'output'),
+]
+
+
+class TestDiv(unittest.TestCase):
+    def test_ScatterElementsUpdate_has_axis_and_3_inputs(self):
+        graph = build_graph(nodes, edges, {'node': {'axis': 1}}, nodes_with_edges_only=True)
+        ScatterNormalizer().find_and_replace_pattern(graph)
+
+        graph_ref = build_graph(nodes, [
+            *edges,
+            *connect('axis', '3:node'),
+        ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+        self.assertTrue(flag, resp)
+
+    def test_ScatterElementsUpdate_has_axis_and_4_inputs(self):
+        graph = build_graph(nodes, [
+            *edges,
+            *connect('axis', '3:node'),
+        ], {'node': {'axis': 1}, 'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+        self.assertRaises(AssertionError, ScatterNormalizer().find_and_replace_pattern, graph)
+
+    def test_ScatterElementsUpdate_has_no_axis_and_3_inputs(self):
+        graph = build_graph(nodes, edges, nodes_with_edges_only=True)
+        self.assertRaises(AssertionError, ScatterNormalizer().find_and_replace_pattern, graph)
+
+    def test_ScatterElementsUpdate_has_no_axis_and_4_inputs(self):
+        graph = build_graph(nodes, [
+            *edges,
+            *connect('axis', '3:node'),
+        ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+        ScatterNormalizer().find_and_replace_pattern(graph)
+
+        graph_ref = build_graph(nodes, [
+            *edges,
+            *connect('axis', '3:node'),
+        ], {'axis': {'value': np.int64(1)}}, nodes_with_edges_only=True)
+
+        (flag, resp) = compare_graphs(graph, graph_ref, 'output', check_op_attrs=True)
+        self.assertTrue(flag, resp)
index 5cdd165..ad3a7e4 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.softsign_replacer import SoftSign
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter', 'shape': np.array([1, 227, 227, 3])},
index 3e6d7d1..26d095c 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.squared_difference import SquaredDifference
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter', 'shape': np.array([1, 227, 227, 3])},
index ea053e7..c029096 100644 (file)
@@ -16,8 +16,6 @@
 
 import logging as log
 
-import networkx as nx
-
 from mo.front.common.replacement import FrontReplacementSubgraph
 from mo.graph.graph import Graph
 
index 02f5f2b..11d8b43 100644 (file)
@@ -16,8 +16,6 @@
 
 import logging as log
 
-import networkx as nx
-
 from mo.front.common.replacement import FrontReplacementOp
 from mo.graph.graph import Node, Graph
 from mo.utils.error import Error
index b2a41b8..9748252 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from mo.front.common.partial_infer.elemental import single_output_infer
+from mo.front.extractor import FrontExtractorOp
 
 
 class LoopCondFrontExtractor(FrontExtractorOp):
index bfab111..26d6e1e 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.front.standalone_const_eraser import StandaloneConstEraser
 from extensions.ops.DetectionOutput import DetectionOutput
 from mo.front.common.partial_infer.utils import int64_array
index 44bcb95..f46bb92 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+from typing import Optional
 
 from extensions.ops.elementwise import Mul
 from extensions.ops.interpolate import Interpolate
@@ -24,7 +25,6 @@ from mo.graph.graph import Graph, Node
 from mo.ops.const import Const
 from mo.ops.shape import Shape
 from mo.ops.strided_slice import StridedSlice
-from typing import Optional
 
 
 def get_concat_after_split(split: Node) -> Optional[Node]:
index 05ccae6..6eb9e5f 100644 (file)
@@ -21,9 +21,8 @@ import numpy as np
 
 from extensions.front.tf.SplitConcatPairToInterpolate import SplitConcatPairToInterpolate
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
 
 graph_node_attrs_for_2d_spatial_case = {
         'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index aea70f0..727a79c 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 from extensions.ops.select import Select
-from mo.graph.graph import Graph
 from mo.front.common.replacement import FrontReplacementSubgraph
+from mo.graph.graph import Graph
 
 
 class SwitchMergeOptimization(FrontReplacementSubgraph):
index 5d177a2..780a169 100644 (file)
@@ -17,8 +17,8 @@ import unittest
 
 from extensions.front.tf.SwitchMergeOptimization import SwitchMergeOptimization
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class SwitchMergeOptimizationTest(unittest.TestCase):
index 3b81887..6800574 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.TensorArrayGather import TensorArrayGather
-from mo.front.tf.extractors.utils import tf_int_list, tf_tensor_shape
+from mo.front.extractor import FrontExtractorOp
+from mo.front.tf.extractors.utils import tf_tensor_shape
 from mo.graph.graph import Node
 
 
index 09ed9d1..e251dfd 100644 (file)
@@ -19,7 +19,7 @@ import logging as log
 import networkx as nx
 
 from mo.front.common.replacement import FrontReplacementOp
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.utils.error import Error
 
 
index ac3dc7b..7cea253 100644 (file)
 
 import logging as log
 
-import numpy as np
-
-from extensions.ops.bucketize import Bucketize
 from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.graph.graph import Graph, Node
+from mo.graph.graph import Graph
 from mo.ops.const import Const
 
 
index 8ef4706..82ae051 100644 (file)
@@ -17,7 +17,7 @@ import numpy as np
 
 from mo.front.common.partial_infer.utils import convert_tf_padding_to_str, int64_array
 from mo.front.extractor import FrontExtractorOp
-from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
+from mo.front.tf.extractors.utils import tf_data_format_channel, tf_data_format_batch, \
     tf_int_list
 from mo.ops.convolution import Convolution
 from mo.ops.op import PermuteAttrs
@@ -89,7 +89,7 @@ def tf_create_attrs(node, input_feature_channel, output_feature_channel):
 
     attrs = {
         'type': 'Convolution',
-        'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+        'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
         'bias_addable': True,
         'bias_term': False,
         'dilation': dilations,
index 16206a3..adae35f 100644 (file)
@@ -13,9 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 
-from mo.front.common.partial_infer.utils import convert_tf_padding_to_str, int64_array
+from mo.front.common.partial_infer.utils import convert_deconv_tf_padding_to_str, int64_array
 from mo.front.extractor import FrontExtractorOp
 from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
     tf_int_list
@@ -65,7 +64,7 @@ def tf_create_attrs(node, input_feature_channel, output_feature_channel):
     data_format = node.pb.attr["data_format"]
 
     return {
-        'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+        'auto_pad': convert_deconv_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
         'bias_addable': True,
         'bias_term': False,
         'spatial_dims': tf_data_format_spatial(data_format),
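
Note: the two hunks above (convolution and deconvolution extractors) stop passing the raw TF AttrValue protobuf into the padding-conversion helpers and pass the decoded string instead. A tiny sketch of the call-site difference, using a stand-in attribute object (the real `node.pb.attr['padding']` comes from the TF GraphDef, where string attributes are stored as bytes in the `s` field):

    class FakeAttrValue:
        def __init__(self, s: bytes):
            self.s = s  # TF stores string attributes as bytes

    padding_attr = FakeAttrValue(b'SAME')

    # old call site: the helper received the whole attribute object
    # convert_tf_padding_to_str(padding_attr)
    # new call site: the extractor decodes the string first
    print(padding_attr.s.decode())  # 'SAME'
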
index fcc6343..ed63e27 100644 (file)
 """
 import logging as log
 
-import numpy as np
-
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementOp
 from mo.front.tf.extractors.utils import tf_dtype_extractor
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.ops.const import Const
 
 
index 8109ce9..265b54e 100644 (file)
 import unittest
 
 from extensions.front.tf.mvn_unrolled import MVNUnrolled
+from extensions.ops.mvn import MVN
 from mo.ops.op import Op
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
-from extensions.ops.mvn import MVN
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class MVNUnrolledMatchingTests(unittest.TestCase):
index 0500da7..028ed2f 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 
 from mo.front.common.partial_infer.utils import convert_tf_padding_to_str
 from mo.front.extractor import FrontExtractorOp
-from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_data_format_channel, tf_data_format_batch, \
-    tf_int_list
+from mo.front.tf.extractors.utils import tf_data_format_spatial, tf_int_list
 from mo.ops.pooling import Pooling
 
 
@@ -78,7 +76,7 @@ def create_pooling_attrs(node, pool_method):
     data_format = node.pb.attr["data_format"]
 
     attrs = {
-        'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding']),
+        'auto_pad': convert_tf_padding_to_str(node.pb.attr['padding'].s.decode()),
         'window': tf_int_list(node.pb.attr["ksize"].list),
         'spatial_dims': tf_data_format_spatial(data_format),
         'pad': None,  # will be inferred when input shape is known
index ff653d2..94c2b0f 100644 (file)
 """
 
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.softmax import Softmax
+from mo.ops.softmax import LogSoftmax, Softmax
 
 
-class SoftmaxFrontExtractor(FrontExtractorOp):
+class SoftmaxExtractor(FrontExtractorOp):
     op = 'Softmax'
     enabled = True
 
@@ -30,3 +30,17 @@ class SoftmaxFrontExtractor(FrontExtractorOp):
             axis = node.pb.attr['axis'].i
         Softmax.update_node_stat(node, {'axis': axis})
         return cls.enabled
+
+
+class LogSoftmaxExtractor(FrontExtractorOp):
+    op = 'LogSoftmax'
+    enabled = True
+
+    @classmethod
+    def extract(cls, node):
+        # the default value for the TF LogSoftmax is -1
+        axis = -1
+        if 'axis' in node.pb.attr:
+            axis = node.pb.attr['axis'].i
+        LogSoftmax.update_node_stat(node, {'axis': axis})
+        return cls.enabled
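
Note: the TF LogSoftmax extractor added here defaults the axis to -1, while the ONNX LogSoftmax extractor earlier in this patch defaults to 1; for tensors above 2D the two defaults are not interchangeable. A short NumPy check, illustrative only:

    import numpy as np

    def lsm(t, axis):
        return t - np.log(np.exp(t).sum(axis=axis, keepdims=True))

    x = np.random.randn(1, 3, 2, 2).astype(np.float32)
    # ONNX extractor default (axis=1) vs TF extractor default (axis=-1)
    print(np.allclose(lsm(x, 1), lsm(x, -1)))  # almost surely False
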
index 58f6eed..520be08 100644 (file)
@@ -13,9 +13,9 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from extensions.ops.split import Split
 from extensions.ops.elementwise import Sub
 from extensions.ops.rank import Rank
+from extensions.ops.split import Split
 from extensions.ops.transpose import Transpose
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementPattern
index f5044b4..efee2b4 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.space_to_depth import SpaceToDepth
+from mo.front.extractor import FrontExtractorOp
 
 
 class SpaceToDepthFrontExtractor(FrontExtractorOp):
index d859573..5edff71 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.sparse_fill_empty_rows import SparseFillEmptyRows
 from mo.front.extractor import FrontExtractorOp
 
index fc2440c..d2a656d 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.sparse_segment_mean import SparseSegmentMean
 from mo.front.extractor import FrontExtractorOp
 
index 9b08e39..b0137d1 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.sparse_segment_sqrtn import SparseSegmentSqrtN
 from mo.front.extractor import FrontExtractorOp
 
index 292b6e2..33a2e36 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.sparse_segment_sum import SparseSegmentSum
 from mo.front.extractor import FrontExtractorOp
 
index a9b425e..b331775 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.sparse_to_dense import SparseToDense
 from mo.front.extractor import FrontExtractorOp
 
index 1544e1f..2732212 100644 (file)
@@ -18,9 +18,7 @@ import logging as log
 
 from extensions.ops.sparse_weighted_sum import ExperimentalSparseWeightedSum
 from mo.front.common.replacement import FrontReplacementSubgraph
-from mo.graph.graph import Node, Graph
-from mo.ops.op import Op
-from mo.ops.shape import Shape
+from mo.graph.graph import Graph
 
 
 class ExperimentalSparseWeightedSumFrontReplacer(FrontReplacementSubgraph):
index 03d6abb..becbac3 100644 (file)
 
 import unittest
 
-from extensions.front.tf.sparse_weighted_sum import ExperimentalSparseWeightedSumFrontReplacer, ExperimentalSparseWeightedSumFrontReplacer2
+from extensions.front.tf.sparse_weighted_sum import ExperimentalSparseWeightedSumFrontReplacer, \
+    ExperimentalSparseWeightedSumFrontReplacer2
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class ExperimentalSparseWeightedSumFrontReplacersTest(unittest.TestCase):
index 31c7597..211e042 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.front.tf.swish import Swish
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': np.array([1, 227, 227, 3]), 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index affd55a..7206994 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.topk import TopK
+from mo.front.extractor import FrontExtractorOp
 
 
 class TopKExtractor(FrontExtractorOp):
index 7ce426d..75f1d5b 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.unique import Unique
 from mo.front.extractor import FrontExtractorOp
 
index 3cc12f0..da318a7 100644 (file)
@@ -17,9 +17,9 @@ import logging as log
 
 import numpy as np
 
+from extensions.middle.ApplyNHWCtoNCHWpermutation import ApplyNHWCtoNCHWpermutation
 from extensions.middle.InsertLayoutPropagationTransposes import is_input_data_in_correct_layout, \
     is_output_data_in_correct_layout
-from extensions.middle.ApplyNHWCtoNCHWpermutation import ApplyNHWCtoNCHWpermutation
 from extensions.middle.pass_separator import PostMiddleStart
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
index 909357c..8d7803f 100644 (file)
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import numpy as np
+
 from extensions.middle.EltwiseChecker import EltwiseChecker
 from extensions.ops.elementwise import Add
 from mo.front.common.layout import get_features_dim
@@ -20,7 +22,6 @@ from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
 from mo.ops.unsqueeze import Unsqueeze
-import numpy as np
 
 
 class BiasAddInputBroadcasting(MiddleReplacementPattern):
index 1d50efa..8705f03 100644 (file)
@@ -14,7 +14,7 @@
  limitations under the License.
 """
 
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.middle.passes.eliminate import remove_op_node_with_data_node
 from mo.middle.replacement import MiddleReplacementPattern
 
index 5552a15..441d50d 100644 (file)
@@ -21,8 +21,8 @@ import numpy as np
 from extensions.middle.ConvertGroupedStridedSlice import ConvertGroupedStridedSlice
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index e85d352..c3493d7 100644 (file)
  limitations under the License.
 """
 
+import unittest
+
 import numpy as np
 
-import unittest
 from extensions.middle.CutInputHavingZeroDimFromConcat import CutInputHavingZeroDimFromConcat
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
 
 node_attrs_for_the_case_when_there_are_no_zero_shape_constants = {
     'const0': {
index 32ce713..f0f0901 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
-
 from extensions.ops.gather import Gather
 from mo.front.common.partial_infer.utils import int64_array
+from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.op import PermuteAttrs
 from mo.ops.const import Const
-from mo.graph.graph import Graph, rename_nodes
+from mo.ops.op import PermuteAttrs
 
 
 class Deconvolution3rdInputNormalization(MiddleReplacementPattern):
index 918317a..1e2bf92 100644 (file)
@@ -20,7 +20,6 @@ from mo.graph.graph import Graph, add_opoutput
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
 from mo.ops.op import Op
-from mo.ops.reshape import Reshape
 from mo.ops.squeeze import Squeeze
 from mo.ops.unsqueeze import Unsqueeze
 
index b9c27ab..4e7ed9b 100644 (file)
 
 from typing import Dict
 
-import logging as log
-
 import numpy as np
 
+from extensions.ops.elementwise import Mul, Add
 from extensions.ops.mvn import MVN
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
-from extensions.ops.elementwise import Mul, Add
 from mo.ops.reshape import Reshape
 from mo.ops.shape import Shape
 from mo.utils.shape import node_to_get_spatial_dimensions_value, node_to_get_features_dimension_value, \
index b70cf87..e9b7870 100644 (file)
@@ -16,8 +16,8 @@
 
 from extensions.middle.pass_separator import PostMiddleStart
 from extensions.ops.transpose import Transpose
-from mo.middle.replacement import MiddleReplacementPattern
 from mo.graph.graph import Graph, Node
+from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
 from mo.ops.op import PermuteAttrs
 
index d670705..72adc17 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 import unittest
 
+import numpy as np
+
 from extensions.middle.InsertSelect import AddSelectBeforeMemoryNodePattern
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class InsertSelectTests(unittest.TestCase):
index 86e0b5f..ef83275 100644 (file)
 """
 
 import logging as log
+from typing import List
 
 from extensions.ops.interpolate import Interpolate
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
-from typing import List
 
 
 def node_has_one_consumer(node: Node) -> bool:
index a5a9bf8..6e6390c 100644 (file)
@@ -19,9 +19,8 @@ import unittest
 
 from extensions.middle.InterpolateSequenceToInterpolate import InterpolateSequenceToInterpolate
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
-
+from mo.utils.unittest.graph import build_graph
 
 graph_node_attrs_for_2d_case_1 = {
     'placeholder': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index 20075f2..5d130e9 100644 (file)
@@ -15,7 +15,9 @@
 """
 
 import unittest
+
 import numpy as np
+
 from extensions.middle.L2NormToNorm import L2NormToNorm
 from mo.utils.ir_engine.compare_graphs import compare_graphs
 from mo.utils.unittest.graph import build_graph_with_attrs
index f8ced45..04a759c 100644 (file)
@@ -22,7 +22,6 @@ from mo.graph.graph import Graph, add_opoutput
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
 from mo.ops.op import Op
-from mo.ops.reshape import Reshape
 from mo.ops.squeeze import Squeeze
 from mo.ops.unsqueeze import Unsqueeze
 
index 7e38104..7802c7d 100644 (file)
@@ -15,7 +15,6 @@
 """
 import numpy as np
 
-from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.concat import Concat
index bd42e76..58a7283 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.tf.graph_utils import create_op_node_with_second_input
 from mo.graph.graph import Graph
 from mo.middle.replacement import MiddleReplacementPattern
-from mo.ops.const import Const
-from mo.ops.reshape import Reshape
-from mo.ops.tile import Tile
+from mo.ops.unsqueeze import Unsqueeze
 
 
 class MXTileReplacer(MiddleReplacementPattern):
     """
-        This class Reshape Tile operation if len input shape < output shape.
+        Aligns Tile operation from MxNet framework with OpenVINO Tile
+
+        MxNet has no restrictions on the `tile_array` input of the `Tile` operation.
+        If len(tile_array) > rank(data), this transformation inserts an Unsqueeze before the Tile operation,
+        because in this case len(output_shape) > len(input_shape).
+
+        DOC link: https://beta.mxnet.io/api/ndarray/_autogen/mxnet.ndarray.tile.html#mxnet.ndarray.tile
     """
 
     enabled = True
-    force_clean_up = True
 
     def pattern(self):
         return dict(
@@ -42,16 +44,17 @@ class MXTileReplacer(MiddleReplacementPattern):
 
     @staticmethod
     def replace_pattern(graph: Graph, match: dict):
-        mxtile = match['tile']
-
-        in_shape = mxtile.in_port(0).data.get_shape()
-        out_shape = mxtile.out_node(0).shape
-
-        tile_array_diff = (len(out_shape) - len(in_shape))
-        if tile_array_diff > 0:
-            reshape_shape = np.copy(in_shape)
-            for i in range(tile_array_diff):
-                reshape_shape = np.insert(in_shape, 0, 1, axis=0)
-            reshape_node = create_op_node_with_second_input(graph, Reshape, int64_array(reshape_shape), dict(name=mxtile.id + "/Reshape"))
-            mxtile.in_port(0).get_source().get_connection().set_destination(reshape_node.in_port(0))
-            reshape_node.out_port(0).get_connection().set_destination(mxtile.in_port(0))
+        node = match['tile']
+        name = node.soft_get('name', node.id)
+        in_shape = node.in_port(0).data.get_shape()
+        out_shape = node.out_port(0).data.get_shape()
+
+        tile_array_diff = len(out_shape) - len(in_shape)
+        if tile_array_diff == 0:
+            return
+        assert tile_array_diff > 0,\
+            'Unexpected difference between rank(input) and rank(output) for node {}'.format(name)
+        unsqueeze_dims = int64_array(range(tile_array_diff))
+        unsqueeze = create_op_node_with_second_input(graph, Unsqueeze, unsqueeze_dims,
+                                                     dict(name=name + '/Unsqueeze', override_output_shape=True))
+        node.in_port(0).get_connection().insert_node(unsqueeze)
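
The rewritten replacer aligns ranks by unsqueezing leading axes of the data before Tile. A minimal NumPy sketch (illustrative only, not part of the patch; names are made up) of why prepending unit dimensions reproduces MxNet's tile semantics when len(tile_array) > rank(data):

    import numpy as np

    data = np.arange(6).reshape(2, 3)      # rank 2
    reps = (4, 1, 2)                       # len(reps) == 3 > data.ndim

    # Equivalent of inserting Unsqueeze over axes range(rank_diff):
    rank_diff = len(reps) - data.ndim
    aligned = data.reshape((1,) * rank_diff + data.shape)

    # MxNet/NumPy implicitly prepend unit dims, so both calls agree.
    assert np.array_equal(np.tile(data, reps), np.tile(aligned, reps))
    print(np.tile(aligned, reps).shape)    # (4, 2, 6)
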
diff --git a/model-optimizer/extensions/middle/MXTileReplacer_test.py b/model-optimizer/extensions/middle/MXTileReplacer_test.py
new file mode 100644 (file)
index 0000000..5226dfe
--- /dev/null
@@ -0,0 +1,118 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+
+from extensions.middle.MXTileReplacer import MXTileReplacer
+from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
+
+nodes_attributes = {
+    'placeholder': {'kind': 'op', 'op': 'Parameter'},
+    'placeholder_data': {'kind': 'data'},
+    'tile': {'kind': 'op', 'op': 'Tile'},
+    'tile_data': {'kind': 'data', 'shape': int64_array([1, 1, 1, 1])},
+    'result': {'kind': 'op', 'op': 'Result'},
+
+    'unsqueeze_1': {'kind': 'op', 'op': 'Unsqueeze'},
+    'unsqueeze_1_data': {'kind': 'data'},
+    'unsqueeze_1_const': {'kind': 'op', 'op': 'Const'},
+    'unsqueeze_1_const_data': {'kind': 'data'},
+}
+
+
+class MXTileReplacerTest(unittest.TestCase):
+
+    def test_insert_one_unsqueeze(self):
+        graph = build_graph(
+            nodes_attributes,
+            [
+                ('placeholder', 'placeholder_data'),
+                ('placeholder_data', 'tile'),
+                ('tile', 'tile_data'),
+                ('tile_data', 'result')
+            ],
+            {
+                'placeholder_data': {'shape': int64_array([1, 1, 1])}
+            },
+            nodes_with_edges_only=True
+        )
+
+        ref_graph = build_graph(
+            nodes_attributes,
+            [
+                ('placeholder', 'placeholder_data'),
+                ('placeholder_data', 'unsqueeze_1', {'in': 0}),
+                ('unsqueeze_1_const', 'unsqueeze_1_const_data'),
+                ('unsqueeze_1_const_data', 'unsqueeze_1', {'in': 1}),
+                ('unsqueeze_1', 'unsqueeze_1_data'),
+                ('unsqueeze_1_data', 'tile'),
+                ('tile', 'tile_data'),
+                ('tile_data', 'result')
+            ],
+            {
+                'placeholder_data': {'shape': int64_array([1, 1, 1])},
+                'unsqueeze_1_const_data': {'value': int64_array([0])}
+            },
+            nodes_with_edges_only=True
+        )
+
+        MXTileReplacer().find_and_replace_pattern(graph)
+        graph.clean_up()
+
+        (flag, resp) = compare_graphs(graph, ref_graph, 'placeholder', check_op_attrs=True)
+        self.assertTrue(flag, resp)
+
+    def test_insert_two_unsqueezes(self):
+        graph = build_graph(
+            nodes_attributes,
+            [
+                ('placeholder', 'placeholder_data'),
+                ('placeholder_data', 'tile'),
+                ('tile', 'tile_data'),
+                ('tile_data', 'result')
+            ],
+            {
+                'placeholder_data': {'shape': int64_array([1, 1])}
+            },
+            nodes_with_edges_only=True
+        )
+
+        ref_graph = build_graph(
+            nodes_attributes,
+            [
+                ('placeholder', 'placeholder_data'),
+                ('placeholder_data', 'unsqueeze_1', {'in': 0}),
+                ('unsqueeze_1_const', 'unsqueeze_1_const_data'),
+                ('unsqueeze_1_const_data', 'unsqueeze_1', {'in': 1}),
+                ('unsqueeze_1', 'unsqueeze_1_data'),
+                ('unsqueeze_1_data', 'tile'),
+                ('tile', 'tile_data'),
+                ('tile_data', 'result')
+            ],
+            {
+                'placeholder_data': {'shape': int64_array([1, 1])},
+                'unsqueeze_1_const_data': {'value': int64_array([0, 1])}
+            },
+            nodes_with_edges_only=True
+        )
+
+        MXTileReplacer().find_and_replace_pattern(graph)
+        graph.clean_up()
+
+        (flag, resp) = compare_graphs(graph, ref_graph, 'placeholder', check_op_attrs=True)
+        self.assertTrue(flag, resp)
index c76ff48..52f16f7 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.MinimumMiddleReplacer import MinimumMiddleReplacer
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index 91b60fe..ca89b4c 100644 (file)
@@ -16,6 +16,7 @@
 
 import logging as log
 from typing import Dict
+
 import numpy as np
 
 from extensions.middle.BinarizeWeightsM1P1 import BinarizeWeightsM1P1
index 2a8960b..0af60ef 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.ReluQuantizeFuse import ReluQuantizeFuse, ReluFakeQuantizeMark
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes = {
     # input
index c263e68..9b8de7c 100644 (file)
@@ -16,8 +16,8 @@
 import unittest
 
 from extensions.middle.RemoveDuplicationMemory import RemoveMemoryDuplicationPattern, MergeNeighborSplicePattern
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class RemoveMemoryDuplicationPatternTests(unittest.TestCase):
index 15f5ecf..7f91a7e 100644 (file)
@@ -17,8 +17,8 @@ import unittest
 
 from extensions.middle.RemoveUselessConcatSplit import RemoveUselessConcatSplitPattern
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class RemoveUselessConcatSplitTests(unittest.TestCase):
index b43832b..54554fc 100644 (file)
@@ -16,8 +16,8 @@
 import unittest
 
 from extensions.middle.RemoveUselessCrops import RemoveUselessCropsPattern
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class RemoveUselessCropsPatternTests(unittest.TestCase):
index 89d341f..c00bd53 100644 (file)
@@ -17,8 +17,8 @@ import unittest
 
 from extensions.middle.ReplaceMemoryOffsetWithSplice import ReplaceMemoryOffsetNodePattern
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class ReplaceMemoryOffsetNodePatternTests(unittest.TestCase):
index 9681918..4bb01de 100644 (file)
@@ -16,8 +16,8 @@
 import unittest
 
 from extensions.middle.ReplacePNorm import ReplacePNormNodePattern
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class ReplacePNormNodePatternTests(unittest.TestCase):
index 99cd084..f892c52 100644 (file)
@@ -17,8 +17,8 @@ import unittest
 
 from extensions.middle.ReplaceSpliceNodePattern import ReplaceSpliceNodePattern
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class ReplaceSpliceNodePatternTests(unittest.TestCase):
index 4a42caa..1148843 100644 (file)
@@ -19,8 +19,8 @@ from argparse import Namespace
 import numpy as np
 
 from extensions.middle.ScaleInput import ScaleInput
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'},
                     'node_1_data': {'value': None, 'kind': 'data', 'data_type': None},
index 6b88d51..0be30a4 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.SharedWeightsDuplication import SharedWeightsDuplication
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'const': {'shape': None, 'type': 'Const', 'kind': 'op', 'op': 'Const'},
index 2f38cd1..ba6e7e1 100644 (file)
@@ -17,7 +17,7 @@
 import numpy as np
 
 from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Graph, Node, rename_node, rename_nodes
+from mo.graph.graph import Graph, Node, rename_nodes
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.ops.const import Const
 from mo.ops.crop import Crop
index 9998e5a..d3397fc 100644 (file)
@@ -47,7 +47,6 @@ class BackEdgesMatching(MiddleReplacementPattern):
     graph_condition = [lambda graph: graph.graph['is_cyclic']]
 
     def run_after(self):
-        from extensions.middle.TensorIteratorCondition import SimpleConditionMatcher
         return [DynamicDecoderConditionMatcher]
 
     def run_before(self):
index 85fc190..9abd5d2 100644 (file)
@@ -17,8 +17,8 @@
 import unittest
 
 from extensions.middle.TensorIteratorBackEdge import BackEdgesMatching
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class BackEdgesMatchingTests(unittest.TestCase):
index dc8ac92..81838e2 100644 (file)
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.TensorIteratorCondition import LoopConditionMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class TensorIteratorConditionTests(unittest.TestCase):
index ae66ecf..2279ae8 100644 (file)
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.TensorIteratorInput import SmartInputMatcher, SimpleInputMatcher, BackEdgeSimpleInputMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class SmartInputMatcherTests(unittest.TestCase):
index 0096d5b..d2f7f41 100644 (file)
  limitations under the License.
 """
 
+from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize
 from extensions.middle.TF_lstm_cell_to_generic import TensorFlowLSTMtoGeneric
 from extensions.middle.TensorIteratorMerge import TensorIteratorMerge
 from mo.graph.graph import Graph
 from mo.middle.pattern_match import find_isomorphisms
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.utils.error import Error
-from extensions.middle.ONNXRNNSequenceNormalize import ONNXRNNSequenceNormalize
 
 
 class TensorIteratorLSTM(MiddleReplacementPattern):
index 9181015..d62d0dd 100644 (file)
@@ -15,9 +15,9 @@
 """
 
 from collections import deque
+from copy import deepcopy
 
 import numpy as np
-from copy import deepcopy
 
 from extensions.ops.tensor_iterator import TensorIterator
 from mo.graph.graph import Node, Graph, add_opoutput
index a800eda..470a76c 100644 (file)
@@ -18,8 +18,8 @@ import unittest
 import numpy as np
 
 from extensions.middle.TensorIteratorOutput import SmartOutputMatcher
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class SmartOutputMatcherTests(unittest.TestCase):
index 82e18aa..0d063ba 100644 (file)
@@ -20,8 +20,8 @@ import numpy as np
 
 from extensions.middle.quantize_fuses import FakeQuantizeFuse
 from mo.front.common.partial_infer.eltwise import eltwise_infer
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes = {
     'placeholder': {'kind': 'op', 'op': 'Placeholder'},
index 847401f..eeaa381 100644 (file)
@@ -16,8 +16,7 @@
 
 import numpy as np
 
-from extensions.ops.identity import IdentityOp
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.middle.passes.eliminate import merge_data_nodes
 from mo.middle.replacement import MiddleReplacementPattern
 from mo.utils.error import Error
index 47d0a00..d99c202 100644 (file)
@@ -17,8 +17,8 @@ import unittest
 
 from extensions.middle.sparse_reshape import SparseReshapeMiddleReplacer
 from mo.front.common.partial_infer.utils import int64_array
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class SparseReshapeMiddleReplacerTests(unittest.TestCase):
index 4609349..bfd753a 100644 (file)
 """
 import unittest
 
-import numpy as np
-
-from extensions.middle.UselessStridedSlice import UselessStridedSliceEraser
 from extensions.middle.wights_permute_normalizer import WeightsPermuteNormalizer
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
-from mo.utils.ir_engine.compare_graphs import compare_graphs
 
 nodes_attributes = {
     'placeholder': {'type': 'Placeholder', 'kind': 'op', 'op': 'Placeholder'},
index eaa928a..4256ad5 100644 (file)
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.front.common.partial_infer.utils import mark_input_bins
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
-import numpy as np
 
 
 class BlockLSTM(Op):
index edef1bc..167c6b2 100644 (file)
  limitations under the License.
 """
 
-
-import networkx as nx
 import numpy as np
+
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
-from mo.utils.error import Error
 
 
 class Enter(Op):
index 0d5798d..cd2a48f 100644 (file)
@@ -15,9 +15,9 @@
 """
 
 import numpy as np
+
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
-from mo.utils.error import Error
 
 
 class Exit(Op):
index 47ac286..e943e7d 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
+import numpy as np
+
 from extensions.ops.RNN import rnn_infer
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
-import numpy as np
 
 
 class GRU(Op):
index 1f5c47d..e0a98b5 100644 (file)
@@ -24,7 +24,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 graph_nodes_attrs = {
     'A': {'type': 'Const', 'op': 'Const', 'kind': 'op', 'shape': None, 'value': None},
     'A_data': {'kind': 'data', 'shape': None, 'value': None},
index 9c048fb..eddc0a8 100644 (file)
@@ -14,7 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index 3071d2f..d16b2f4 100644 (file)
@@ -14,7 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index ef752ca..fc2564b 100644 (file)
@@ -14,9 +14,6 @@
  limitations under the License.
 """
 
-
-import networkx as nx
-
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
 
index 73cec11..1d22a6f 100644 (file)
  limitations under the License.
 """
 
+from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
 from mo.ops.op import Op
 from mo.ops.pooling import Pooling
-from mo.front.common.partial_infer.utils import int64_array
 
 
 class AdaptiveAvgPooling(Op):
index 030fd9f..52c1a93 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+
 import numpy as np
 
 from mo.front.caffe.extractors.utils import get_canonical_axis_index
index d1d3916..3f626cf 100644 (file)
@@ -16,7 +16,6 @@
 
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
-from mo.utils.error import Error
 
 
 class Assert(Op):
index 45b8d45..f1c68f9 100644 (file)
@@ -14,8 +14,7 @@
  limitations under the License.
 """
 
-from mo.front.common.partial_infer.utils import int64_array
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.ops.op import Op
 
 
index 4773383..9255a73 100644 (file)
@@ -19,6 +19,7 @@ import numpy as np
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
 
+
 class Bucketize(Op):
     op = 'Bucketize'
 
index d165e33..fbc141a 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 nodes_attributes = {'input_tensor': {'shape': None, 'value': None, 'kind': 'data'},
                     'input_buckets': {'shape': None, 'value': None, 'kind': 'data'},
                     'bucketize_node': {'op': 'Bucketize', 'kind': 'op', 'with_right_bound': False},
index b5dc642..8676c1d 100644 (file)
@@ -18,8 +18,6 @@
 #                axis - dimension number for tensors concatenation
 import copy
 
-import networkx as nx
-
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
 
index 3f91e95..a559f08 100644 (file)
@@ -16,6 +16,7 @@
 
 import numpy as np
 
+from mo.front.common.layout import shape_for_layout, get_height_dim, get_batch_dim, get_features_dim, get_width_dim
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
@@ -51,13 +52,25 @@ class DepthToSpaceOp(Op):
         if in_shape.size != 4:
             raise Error('TensorFlow DepthToSpace operation is supported for 4D \'NHWC\' input layout only. '
                         'Current input shape is \'{}\''.format(in_shape))
-        N, H, W, C = in_shape
+
+        layout = node.graph.graph['layout']
+
+        N = in_shape[get_batch_dim(layout, 4)]
+        H = in_shape[get_height_dim(layout, 4)]
+        W = in_shape[get_width_dim(layout, 4)]
+        C = in_shape[get_features_dim(layout, 4)]
+
         block_size = node['block_size']
         if C % (block_size ** 2):
             raise Error('Feature dimensions of input tensor of DepthToSpace operation have to be divisible by square '
                         'of DepthToSpace \'block_size\' parameter. Input tensor shape = {}. Feature dimension = {}. '
                         'block_size = {}'.format(in_shape, C, block_size))
-        out_shape = [N, int(H * block_size), int(W * block_size), int(C / (block_size ** 2))]
-        if np.prod(in_shape) != np.prod(out_shape):
-            return
+
+        out_shape = shape_for_layout(layout,
+                                     batch=N,
+                                     features=int(C / (block_size ** 2)),
+                                     height=int(H * block_size),
+                                     width=int(W * block_size))
+
+        assert np.prod(in_shape) == np.prod(out_shape)
         node.out_node().shape = int64_array(out_shape)
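
With the layout helpers, the same shape arithmetic now covers both NHWC and NCHW. A hedged standalone restatement of the output-shape rule (block_size of 2 assumed, the helper name is hypothetical), matching the expectations in the updated unit tests:

    def depth_to_space_out_shape(in_shape, layout, block_size=2):
        # Spatial dims are multiplied by block_size, features divided by block_size^2.
        if layout == 'NHWC':
            n, h, w, c = in_shape
            return [n, h * block_size, w * block_size, c // block_size ** 2]
        n, c, h, w = in_shape  # NCHW assumed otherwise
        return [n, c // block_size ** 2, h * block_size, w * block_size]

    assert depth_to_space_out_shape([1, 1024, 576, 256], 'NHWC') == [1, 2048, 1152, 64]
    assert depth_to_space_out_shape([1, 256, 1024, 576], 'NCHW') == [1, 64, 2048, 1152]
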
index 2d995c2..e8e558c 100644 (file)
@@ -15,9 +15,7 @@
 """
 
 import unittest
-
 import numpy as np
-
 from extensions.ops.depth_to_space import DepthToSpaceOp
 from mo.graph.graph import Node
 from mo.utils.error import Error
@@ -36,22 +34,43 @@ edges = [
 
 
 class TestDepthToSpacePartialInfer(unittest.TestCase):
-    def test_tf_depth_to_space_infer(self):
+    def test_tf_depth_to_space_infer_nhwc(self):
         graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
         dts_node = Node(graph, 'DtS')
         DepthToSpaceOp.infer(dts_node)
         exp_shape = np.array([1, 2048, 1152, 64])
         res_shape = graph.node['out_data_node']['shape']
         self.assertTrue(np.array_equal(exp_shape, res_shape))
 
+    def test_tf_depth_to_space_infer_nchw(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 256, 1024, 576])
+        dts_node = Node(graph, 'DtS')
+        DepthToSpaceOp.infer(dts_node)
+        exp_shape = np.array([1, 64, 2048, 1152])
+        res_shape = graph.node['out_data_node']['shape']
+        self.assertTrue(np.array_equal(exp_shape, res_shape))
+
     def test_tf_depth_to_space_infer_error(self):
         graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
         graph.node['in_data_node']['shape'] = np.array([1024, 576, 256])
         dts_node = Node(graph, 'DtS')
         self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
 
-    def test_tf_depth_to_space_infer_error_1(self):
+    def test_tf_depth_to_space_infer_divisibility_error_1(self):
         graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
         graph.node['in_data_node']['shape'] = np.array([1, 1024, 576, 255])
         dts_node = Node(graph, 'DtS')
         self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
+
+    def test_tf_depth_to_space_infer_divisibility_error_2(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 255, 1024, 576])
+        dts_node = Node(graph, 'DtS')
+        self.assertRaises(Error, DepthToSpaceOp.infer, dts_node)
+
index b72bedc..31b53d0 100644 (file)
@@ -19,12 +19,11 @@ import unittest
 import numpy as np
 from generator import generator, generate
 
-from extensions.ops.elementwise import Div, Elementwise
+from extensions.ops.elementwise import Div
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 graph_nodes_attrs = {
     'A': {'type': 'Const', 'op': 'Const', 'kind': 'op', 'shape': None, 'value': None},
     'A_data': {'kind': 'data', 'shape': None, 'value': None},
index 8cf32a7..c04beb9 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-import networkx as nx
 import numpy as np
 
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
 from mo.graph.graph import Node, Graph
-from mo.ops.op import Op, PermuteAttrs
+from mo.ops.op import Op
 
 
 class ExpOp(Op):
index 8c73485..8f20683 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.graph.graph import Graph
 from mo.ops.op import Op
index 6f6d315..03b2ae8 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.graph.graph import Graph
 from mo.ops.op import Op
 
index 3fc19cc..233d564 100644 (file)
@@ -16,9 +16,8 @@
 
 import unittest
 
-import networkx as nx
-from mo.graph.graph import Graph
 from extensions.ops.instance_normalization import InstanceNormalization
+from mo.graph.graph import Graph
 
 
 class InstanceNormalizationOp(unittest.TestCase):
index d1b6d18..aaae4c2 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.front.common.partial_infer.utils import mark_input_bins
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
index df52d25..9def594 100644 (file)
 """
 
 import unittest
+
 import numpy as np
 
 from extensions.ops.merge import Merge
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class TestMerge(unittest.TestCase):
index c149819..9310c32 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
-
-from mo.front.caffe.extractors.utils import get_canonical_axis_index
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.ops.op import Op
 
 
index 760da10..82b512d 100644 (file)
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import numpy as np
 import logging as log
 
+import numpy as np
+
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
index 59f366d..151e896 100644 (file)
@@ -1,5 +1,5 @@
 """
- Copyright (C) 2017-2020 Intel Corporation
+ Copyright (C) 2018-2020 Intel Corporation
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -18,30 +18,54 @@ import numpy as np
 
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node, Graph
+from mo.middle.passes.convert_data_type import np_data_type_to_destination_type
 from mo.ops.op import Op
 
 
 class NonZero(Op):
     op = 'NonZero'
+    enabled = False
 
     def __init__(self, graph: Graph, attrs: dict):
+        assert 'output_type' in attrs, 'NonZero has mandatory `output_type` attribute'
         mandatory_props = {
-            'type': None,
-            'op': __class__.op,
-            'infer': NonZero.infer,
+            'op': self.op,
+            'type': self.op,
+            'version': 'opset3',
+
+            'infer': self.infer,
+            'type_infer': self.type_infer,
+
             'in_ports_count': 1,
             'out_ports_count': 1,
         }
         super().__init__(graph, mandatory_props, attrs)
 
+    def backend_attrs(self):
+        return [
+            ('output_type', lambda node: np_data_type_to_destination_type(node.output_type)),
+        ]
+
     @staticmethod
     def infer(node: Node):
-        input_shape = node.in_node(0).shape
-        if input_shape is None:
-            return
-        input_value = node.in_node(0).value
+        node_name = node.soft_get('name', node.id)
+        input_shape = node.in_port(0).data.get_shape()
+        assert input_shape is not None, 'The input shape for node "{}" is None'.format(node_name)
+        assert node.has_valid('output_type'), \
+            '`output_type` attribute is not set for NonZero node `{}`'.format(node_name)
+        assert node.output_type in [np.int64, np.int32], \
+            'NonZero `output_type` attribute must be int32 or int64, `{}` found'.format(np.dtype(node.output_type).name)
+
+        input_value = node.in_port(0).data.get_value()
         if input_value is not None:
-            node.out_port(0).data.set_value(np.array(np.nonzero(input_value)))
+            node.out_port(0).data.set_value(np.array(np.nonzero(input_value), dtype=node.output_type))
         else:
-            node.out_port(0).data.set_shape(int64_array([len(input_shape), *input_shape]))
+            # output shape of NonZero should be [input_rank, dynamic]
+            # since the IR can only be saved with static shapes, we use the upper-bound shape value here
+            node.out_port(0).data.set_shape(int64_array([len(input_shape), np.prod(input_shape)]))
 
+    @staticmethod
+    def type_infer(node):
+        assert node.output_type in [np.int64, np.int32], \
+            'NonZero `output_type` attribute must be int32 or int64, `{}` found'.format(np.dtype(node.output_type).name)
+        node.out_port(0).set_data_type(node.output_type)
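
When the input value is known, the result is the stacked indices of non-zero elements; otherwise only the static upper-bound shape [rank, prod(shape)] can be reported. A small NumPy illustration of both branches (example data only):

    import numpy as np

    x = np.array([[3, 0], [0, 7]], dtype=np.int32)

    # Value known: indices of non-zero elements, cast to output_type.
    value_out = np.array(np.nonzero(x), dtype=np.int64)   # [[0 1], [0 1]], shape (2, 2)

    # Value unknown: upper-bound static shape [rank, number_of_elements].
    shape_out = np.array([x.ndim, np.prod(x.shape)], dtype=np.int64)   # [2 4]
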
index 4d9c075..429a54c 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.common.partial_infer.utils import mark_input_bins
 from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.common.partial_infer.utils import mark_input_bins
 from mo.graph.graph import Graph, Node
 from mo.ops.op import Op
 from mo.utils.utils import convert_param_type
index d030894..9bad21e 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
-import networkx as nx
-
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Graph
 from mo.ops.op import Op
 
 
index bbf4831..5a8e46b 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.graph.graph import Graph
 from mo.ops.op import Op
index 80331ad..80b8968 100644 (file)
@@ -14,7 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index a151f67..4ebe599 100644 (file)
@@ -17,9 +17,9 @@
 import numpy as np
 
 from mo.front.common.partial_infer.elemental import copy_shape_infer
+from mo.front.common.partial_infer.utils import mark_input_bins
 from mo.graph.graph import Graph
 from mo.ops.op import Op
-from mo.front.common.partial_infer.utils import mark_input_bins
 
 
 class PreluOp(Op):
index fecf1cb..6d1ee64 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from extensions.ops.proposal import ProposalOp
 from mo.front.caffe.extractor import register_caffe_python_extractor
 from mo.graph.graph import Graph
index 6c6fb13..58f2635 100644 (file)
@@ -14,7 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
diff --git a/model-optimizer/extensions/ops/roialign.py b/model-optimizer/extensions/ops/roialign.py
new file mode 100644 (file)
index 0000000..4d9ea71
--- /dev/null
@@ -0,0 +1,82 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.front.common.layout import get_features_dim, shape_for_layout
+from mo.graph.graph import Graph
+from mo.ops.op import Op
+
+
+class ROIAlign(Op):
+    op = 'ROIAlign'
+    enabled = False
+
+    def __init__(self, graph: Graph, attrs: dict):
+        assert 'mode' in attrs, '`mode` attribute is not set for ROIAlign during creation'
+        assert 'pooled_h' in attrs, '`pooled_h` attribute is not set for ROIAlign during creation'
+        assert 'pooled_w' in attrs, '`pooled_w` attribute is not set for ROIAlign during creation'
+        assert 'sampling_ratio' in attrs, '`sampling_ratio` attribute is not set for ROIAlign during creation'
+        assert 'spatial_scale' in attrs, '`spatial_scale` attribute is not set for ROIAlign during creation'
+
+        super().__init__(graph, {
+            'op': self.op,
+            'type': self.op,
+            'version': 'opset3',
+
+            'infer': self.infer,
+
+            'in_ports_count': 3,
+            'out_ports_count': 1,
+        }, attrs)
+
+    def backend_attrs(self):
+        return [
+            ('mode', lambda node: str(node.mode)),
+            ('pooled_h', lambda node: str(int(node.pooled_h))),
+            ('pooled_w', lambda node: str(int(node.pooled_w))),
+            ('sampling_ratio', lambda node: str(int(node.sampling_ratio))),
+            ('spatial_scale', lambda node: str(float(node.spatial_scale))),
+        ]
+
+    @staticmethod
+    def infer(node):
+        layout = node.graph.graph['layout']
+        node_name = node.soft_get('name', node.id)
+
+        assert len([port for port in node.in_ports().values() if not port.disconnected()]) == 3, \
+            'The node "{}" must have 3 inputs'.format(node_name)
+
+        assert node.has_valid('pooled_w'), '"pooled_w" attribute is not set for node "{}"'.format(node_name)
+        assert node.has_valid('pooled_h'), '"pooled_h" attribute is not set for node "{}"'.format(node_name)
+        assert node.has_valid('mode'), '"mode" attribute is not set for node "{}"'.format(node_name)
+        assert node.mode in ['avg', 'max'], \
+            '"mode" attribute range of values is ["avg", "max"], got {} for node "{}"'.format(node.mode, node_name)
+
+        input_shape = node.in_port(0).data.get_shape()
+        rois_shape = node.in_port(1).data.get_shape()
+        indices_shape = node.in_port(2).data.get_shape()
+        assert input_shape is not None and rois_shape is not None and indices_shape is not None, \
+            'The node "{}" input shape is None'.format(node_name)
+        assert rois_shape[0] == indices_shape[0], 'The number of batch indices does not correspond to number of ROIs ' \
+                                                  'for node "{}"'.format(node_name)
+        assert rois_shape[1] == 4, 'The size of ROI element must be 4 for node "{}"'.format(node_name)
+        assert len(input_shape) == 4, 'The rank of port 0 input tensor of node "{}" must be 4.'.format(node_name)
+
+        node.out_port(0).data.set_shape(
+            shape_for_layout(layout,
+                             batch=rois_shape[0],
+                             features=input_shape[get_features_dim(layout, 4)],
+                             height=node.pooled_h,
+                             width=node.pooled_w)
+        )
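
The resulting shape is [num_rois, C, pooled_h, pooled_w] in the graph layout. A short numeric check of that rule (all numbers below are hypothetical, NCHW assumed):

    input_shape   = [1, 256, 200, 272]   # N, C, H, W feature map
    rois_shape    = [1000, 4]            # 1000 boxes with 4 coordinates each
    indices_shape = [1000]               # one batch index per box
    pooled_h, pooled_w = 7, 7

    assert rois_shape[0] == indices_shape[0] and rois_shape[1] == 4
    out_shape = [rois_shape[0], input_shape[1], pooled_h, pooled_w]
    print(out_shape)                     # [1000, 256, 7, 7]
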
diff --git a/model-optimizer/extensions/ops/scatter.py b/model-optimizer/extensions/ops/scatter.py
new file mode 100644 (file)
index 0000000..a800ac9
--- /dev/null
@@ -0,0 +1,136 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.graph.graph import Node, Graph
+from mo.ops.op import Op
+
+
+class Scatter(Op):
+    enabled = False
+
+    op = op_type = None
+    version = None
+
+    def __init__(self, graph: Graph, attrs: dict):
+        assert self.op is not None and self.op_type is not None and self.version is not None, \
+            'Please use a specialized Scatter operation class, Scatter is a base class'
+
+        mandatory_props = {
+            'op': self.op,
+            'type': self.op_type,
+            'version': self.version,
+
+            'is_scatter': True,  # is used for gathering all types of scatters in common transformations
+            'infer': self.infer,
+
+            'in_ports_count': 4,
+            'out_ports_count': 1,
+        }
+        super().__init__(graph, mandatory_props, attrs)
+
+    @staticmethod
+    def infer(node: Node):
+        node_name = node.soft_get('name', node.id)
+
+        input_shape = node.in_port(0).data.get_shape()
+        indices_shape = node.in_port(1).data.get_shape()
+        updates_shape = node.in_port(2).data.get_shape()
+        assert input_shape is not None and updates_shape is not None and indices_shape is not None, \
+            'The node "{}" input shape is None'.format(node_name)
+
+        node.out_port(0).data.set_shape(input_shape)
+
+
+class ScatterElementsAdd(Scatter):
+    op = 'ScatterElementsAdd'
+    op_type = None
+    version = None
+
+
+class ScatterElementsDiv(Scatter):
+    op = 'ScatterElementsDiv'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMax(Scatter):
+    op = 'ScatterElementsMax'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMin(Scatter):
+    op = 'ScatterElementsMin'
+    op_type = None
+    version = None
+
+
+class ScatterElementsMul(Scatter):
+    op = 'ScatterElementsMul'
+    op_type = None
+    version = 'opset3'
+
+
+class ScatterElementsSub(Scatter):
+    op = 'ScatterElementsSub'
+    op_type = None
+    version = None
+
+
+class ScatterElementsUpdate(Scatter):
+    op = op_type = 'ScatterElementsUpdate'
+    version = 'opset3'
+
+
+class ScatterAdd(Scatter):
+    op = 'ScatterAdd'
+    op_type = None
+    version = None
+
+
+class ScatterDiv(Scatter):
+    op = 'ScatterDiv'
+    op_type = None
+    version = None
+
+
+class ScatterMax(Scatter):
+    op = 'ScatterMax'
+    op_type = None
+    version = None
+
+
+class ScatterMin(Scatter):
+    op = 'ScatterMin'
+    op_type = None
+    version = None
+
+
+class ScatterMul(Scatter):
+    op = 'ScatterMul'
+    op_type = None
+    version = None
+
+
+class ScatterSub(Scatter):
+    op = 'ScatterSub'
+    op_type = None
+    version = None
+
+
+class ScatterUpdate(Scatter):
+    op = op_type = 'ScatterUpdate'
+    version = 'opset3'
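
All variants share the base infer, which only propagates the data shape; what differs is the update semantics each backend implements. A NumPy sketch of one plausible reading of ScatterElementsUpdate along axis 0 (an assumption for illustration, not the plugin code):

    import numpy as np

    data    = np.zeros((3, 3), dtype=np.float32)
    indices = np.array([[1, 0, 2], [0, 2, 1]])
    updates = np.array([[1., 2., 3.], [4., 5., 6.]], dtype=np.float32)

    # out[indices[i][j], j] = updates[i][j] for axis=0; the output keeps data's
    # shape, which is exactly what Scatter.infer propagates.
    out = data.copy()
    for i in range(indices.shape[0]):
        for j in range(indices.shape[1]):
            out[indices[i, j], j] = updates[i, j]

    assert out.shape == data.shape
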
index 212d979..6d8d075 100644 (file)
 """
 
 import unittest
+
 import numpy as np
 
 from extensions.ops.select import Select
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_attrs
 
 
 class TestSelect(unittest.TestCase):
index 6af7c75..af7490a 100644 (file)
@@ -16,7 +16,6 @@
 
 import logging as log
 
-import networkx as nx
 import numpy as np
 
 from mo.front.extractor import attr_getter
index a2296ee..f91d365 100644 (file)
  limitations under the License.
 """
 
-import logging as log
-
 import numpy as np
 
+from mo.front.common.layout import shape_for_layout, get_height_dim, get_batch_dim, get_features_dim, get_width_dim
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
+from mo.utils.error import Error
 
 
 class SpaceToDepth(Op):
@@ -50,16 +50,26 @@ class SpaceToDepth(Op):
     def infer(node: Node):
         in_shape = node.in_node().shape
         if in_shape.size != 4:
-            log.error('TensorFlow SpaceToDepth operation is supported for 4D \'NHWC\' input layout only. '
-                      'Current input shape is \'{}\''.format(in_shape))
-            return
-        N, H, W, C = in_shape
+            raise Error('TensorFlow SpaceToDepth operation is supported for 4D \'NHWC\' input layout only. '
+                        'Current input shape is \'{}\''.format(in_shape))
+
+        layout = node.graph.graph['layout']
+        N = in_shape[get_batch_dim(layout, 4)]
+        H = in_shape[get_height_dim(layout, 4)]
+        W = in_shape[get_width_dim(layout, 4)]
+        C = in_shape[get_features_dim(layout, 4)]
+
         block_size = node['block_size']
         if H % block_size or W % block_size:
-            log.error('Spatial dimensions of input tensor of SpaceToDepth operation have to be divisible by '
-                      'SpaceToDepth \'block_size\' parameter. Input tensor shape = {}. Spatial dimensions = {},{}. '
-                      'block_size = {}'.format(in_shape, H, W, block_size))
-            return
-        out_shape = [N, int(H / block_size), int(W / block_size), int(C * (block_size ** 2))]
+            raise Error('Spatial dimensions of input tensor of SpaceToDepth operation have to be divisible by '
+                        'SpaceToDepth \'block_size\' parameter. Input tensor shape = {}. Spatial dimensions = {},{}. '
+                        'block_size = {}'.format(in_shape, H, W, block_size))
+
+        out_shape = shape_for_layout(layout,
+                                     batch=N,
+                                     features=int(C * (block_size ** 2)),
+                                     height=int(H / block_size),
+                                     width=int(W / block_size))
+
         assert np.prod(in_shape) == np.prod(out_shape)
         node.out_node().shape = int64_array(out_shape)
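
SpaceToDepth is the shape inverse of the DepthToSpace rule sketched earlier, and it now raises Error instead of silently returning. A brief round-trip check against the shapes used in the new unit tests (helper name is hypothetical):

    def space_to_depth_out_shape(in_shape, layout, block_size=2):
        # Spatial dims divided by block_size, features multiplied by block_size^2.
        if layout == 'NHWC':
            n, h, w, c = in_shape
            return [n, h // block_size, w // block_size, c * block_size ** 2]
        n, c, h, w = in_shape  # NCHW assumed otherwise
        return [n, c * block_size ** 2, h // block_size, w // block_size]

    assert space_to_depth_out_shape([1, 2048, 1152, 64], 'NHWC') == [1, 1024, 576, 256]
    assert space_to_depth_out_shape([1, 64, 2048, 1152], 'NCHW') == [1, 256, 1024, 576]
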
diff --git a/model-optimizer/extensions/ops/space_to_depth_test.py b/model-optimizer/extensions/ops/space_to_depth_test.py
new file mode 100644 (file)
index 0000000..61f3a21
--- /dev/null
@@ -0,0 +1,74 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+import unittest
+import numpy as np
+from extensions.ops.space_to_depth import SpaceToDepth
+from mo.graph.graph import Node
+from mo.utils.error import Error
+from mo.utils.unittest.graph import build_graph
+
+nodes = {
+    'in_data_node': {'value': None, 'kind': 'data', 'shape': np.array([1, 2048, 1152, 64])},
+    'StD': {'op': 'SpaceToDepth', 'kind': 'op', 'block_size': 2},
+    'out_data_node': {'value': None, 'kind': 'data', 'shape': None}
+}
+
+edges = [
+    ('in_data_node', 'StD'),
+    ('StD', 'out_data_node')
+]
+
+class TestSpaceToDepthPartialInfer(unittest.TestCase):
+    def test_tf_space_to_depth_infer_nhwc(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        std_node = Node(graph, 'StD')
+        SpaceToDepth.infer(std_node)
+        exp_shape = np.array([1, 1024, 576, 256])
+        res_shape = graph.node['out_data_node']['shape']
+        self.assertTrue(np.array_equal(exp_shape, res_shape))
+
+    def test_tf_space_to_depth_infer_nchw(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 64, 2048, 1152])
+        std_node = Node(graph, 'StD')
+        SpaceToDepth.infer(std_node)
+        exp_shape = np.array([1, 256, 1024, 576])
+        res_shape = graph.node['out_data_node']['shape']
+        self.assertTrue(np.array_equal(exp_shape, res_shape))
+
+    def test_tf_space_to_depth_infer_shape_error(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        graph.node['in_data_node']['shape'] = np.array([1024, 576, 256])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
+
+    def test_tf_space_to_depth_infer_divisibility_error_1(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NHWC'
+        graph.node['in_data_node']['shape'] = np.array([1, 1024, 577, 256])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
+
+    def test_tf_space_to_depth_infer_divisibility_error_2(self):
+        graph = build_graph(nodes, edges)
+        graph.graph['layout'] = 'NCHW'
+        graph.node['in_data_node']['shape'] = np.array([1, 256, 1024, 577])
+        std_node = Node(graph, 'StD')
+        self.assertRaises(Error, SpaceToDepth.infer, std_node)
\ No newline at end of file
index 5e3737c..4f76e5e 100644 (file)
@@ -16,7 +16,6 @@
 
 import logging as log
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index 7ae05fd..39b7996 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
                     'input_values': {'shape': None, 'value': None, 'kind': 'data'},
                     'dense_shape': {'shape': None, 'value': None, 'kind': 'data'},
index 9a48e61..f2595bd 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
                     'input_shape': {'shape': None, 'value': None, 'kind': 'data'},
                     'new_shape': {'shape': None, 'value': None, 'kind': 'data'},
index 363100b..8eecc7f 100644 (file)
@@ -14,9 +14,6 @@
  limitations under the License.
 """
 
-import logging as log
-
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index 18f335c..84473c9 100644 (file)
@@ -14,9 +14,6 @@
  limitations under the License.
 """
 
-import logging as log
-
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index 1075acf..47989ab 100644 (file)
@@ -14,9 +14,6 @@
  limitations under the License.
 """
 
-import logging as log
-
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index 6c09db5..f58ed09 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 # graph 1
 nodes_attributes = {
     'input_indices': {'kind': 'op', 'op': 'Parameter', 'shape': int64_array([5, 2])},
index e702496..ad7b275 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import numpy as np
-
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node, Graph
 from mo.ops.op import Op
index 3490d21..0f1b400 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 nodes_attributes = {'input_indices': {'shape': None, 'value': None, 'kind': 'data'},
                     'input_values': {'shape': None, 'value': None, 'kind': 'data'},
                     'input_dense_shape': {'shape': None, 'value': None, 'kind': 'data'},
index a865dad..23935ed 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+
 import numpy as np
 
 from mo.front.common.partial_infer.utils import int64_array
index ed8e574..02ffc9e 100644 (file)
@@ -18,11 +18,11 @@ import unittest
 
 import numpy as np
 
+from extensions.ops.split import AttributedSplit, AttributedVariadicSplit
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
-from extensions.ops.split import AttributedSplit, AttributedVariadicSplit
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class TestSplitOp(unittest.TestCase):
index f8be09c..45df1af 100644 (file)
@@ -17,7 +17,6 @@
 from mo.front.common.partial_infer.elemental import copy_shape_infer
 from mo.graph.graph import Graph
 from mo.ops.op import Op
-from mo.front.common.partial_infer.utils import mark_input_bins
 
 
 class StopGradientOp(Op):
index 290e6e2..27d3660 100644 (file)
@@ -21,8 +21,8 @@ import numpy as np
 
 from extensions.ops.switch import Switch
 from mo.graph.graph import Node
-from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph_with_edge_attrs, build_graph_with_attrs
 
 
 class TestSwitch(unittest.TestCase):
index a37e5fc..94eebb0 100644 (file)
@@ -24,7 +24,6 @@ from extensions.ops.transpose import Transpose
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 input_shape = np.array([1, 3, 224, 224])
 
 
index 26dcf84..38916f1 100644 (file)
@@ -14,9 +14,6 @@
  limitations under the License.
 """
 
-import logging as log
-
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index fc18ab3..65da100 100644 (file)
@@ -23,7 +23,6 @@ from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.utils.unittest.graph import build_graph
 
-
 # graph 1 with two outputs: uniques and indices
 nodes_attributes = {'input': {'shape': None, 'value': None, 'kind': 'data'},
                     'unique_node': {'op': 'Unique', 'kind': 'op'},
index 173e8f0..46390b4 100644 (file)
 """
 
 import hashlib
-from defusedxml.minidom import parseString
 from xml.etree.ElementTree import Element, SubElement, tostring
 
+from defusedxml.minidom import parseString
+
 from mo.graph.graph import *
-from mo.middle.passes.convert_data_type import data_type_str_to_precision, np_data_type_to_precision
+from mo.middle.passes.convert_data_type import np_data_type_to_precision
 from mo.utils.unsupported_ops import UnsupportedOps
 from mo.utils.utils import refer_to_faq_msg
 from mo.utils.version import get_version
index 145249d..4cb3c46 100644 (file)
@@ -15,6 +15,7 @@
 """
 import logging as log
 from builtins import AttributeError
+
 from defusedxml import ElementTree
 
 from mo.front.caffe.collect_attributes import collect_attributes
index 8233fb4..d40437c 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import logging as log
-
 import numpy as np
 
 from mo.utils.error import Error
index 828d3e8..6eb896e 100644 (file)
@@ -17,7 +17,6 @@
 import logging as log
 import os
 
-import networkx as nx
 import numpy as np
 
 from mo.graph.graph import Node, Graph
index de4c170..7e0f2dc 100644 (file)
@@ -19,9 +19,9 @@ import logging as log
 # Concat infer : N - number of inputs to concat
 #                axis - dimension number for tensors concatenation
 import numpy as np
-from mo.front.common.partial_infer.utils import int64_array
 
 from mo.front.caffe.extractors.utils import get_canonical_axis_index
+from mo.front.common.partial_infer.utils import int64_array
 from mo.ops.op import PermuteAttrs
 
 
index 762b443..162cb89 100644 (file)
 """
 
 import logging as log
+
 import numpy as np
 
-from mo.front.common.layout import get_batch_dim, get_features_dim, get_height_dim, get_width_dim, shape_for_layout
+from mo.front.common.layout import get_batch_dim, get_features_dim, shape_for_layout
 from mo.graph.graph import Node
 
 
index 36711f8..e88198e 100644 (file)
 """
 
 import logging as log
+from typing import Iterable
 
 import numpy as np
 
-from typing import Iterable
-
 
 def int64_array(l: Iterable):
     return np.array(l, dtype=np.int64)
@@ -55,8 +54,16 @@ def copy_or_none(x):
 
 
 def convert_tf_padding_to_str(padding):
-    mapping = {b'SAME': 'same_upper', b'VALID': 'valid'}
-    return mapping[padding.s]
+    mapping = {'SAME': 'same_upper', 'VALID': 'valid'}
+    return mapping[padding]
+
+
+def convert_deconv_tf_padding_to_str(padding):
+    # according to the formulas for calculating the "auto_pad" values of the
+    # ConvBackpropData layer in the Operation Specification,
+    # the "same_lower" value corresponds to the "same" value of the conv_transpose layer in TensorFlow
+    mapping = {'SAME': 'same_lower', 'VALID': 'valid'}
+    return mapping[padding]
 
 
 # TODO eliminate this dependency and pass necessary function as an argument
@@ -69,10 +76,7 @@ def tf_window_op_pad_infer(input, window, stride, auto_pad, is_deconv=False):
         normalized_stride = 1 / stride
 
     if auto_pad in ['same_lower', 'same_upper']:
-        if auto_pad == 'same_upper':
-            output = np.int64(np.ceil(input / normalized_stride))
-        else:
-            output = np.int64(np.floor(input / normalized_stride))
+        output = np.int64(np.ceil(input / normalized_stride))
         residual = input % stride
         mask = residual == 0
         full_pad = window.copy()
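
For reference, a minimal numpy sketch (an illustration only, not part of the diff) of the padding mapping and the unified ceil-based output size used above; the input size and stride values are assumed for the example:

    import numpy as np

    # TensorFlow 'SAME'/'VALID' map to IR auto_pad values: forward convolution
    # uses same_upper, conv_transpose (deconvolution) uses same_lower.
    conv_auto_pad = {'SAME': 'same_upper', 'VALID': 'valid'}['SAME']      # 'same_upper'
    deconv_auto_pad = {'SAME': 'same_lower', 'VALID': 'valid'}['SAME']    # 'same_lower'

    # For both same_lower and same_upper the spatial output size is the same,
    # ceil(input / stride), which is why the branch above is collapsed.
    input_size = np.array([224, 224])   # assumed spatial dims
    stride = np.array([2, 2])
    output = np.int64(np.ceil(input_size / stride))   # array([112, 112])
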
index fd720c9..d7f3190 100644 (file)
@@ -15,8 +15,6 @@
 """
 import logging as log
 
-import networkx as nx
-
 from mo.front.subgraph_matcher import SubgraphMatch
 from mo.graph.graph import Node, merge_edge_props, Graph
 from mo.middle.pattern_match import apply_pattern
index eba1cdb..1ade5f6 100644 (file)
@@ -25,9 +25,9 @@ from mo.front.extractor import spatial_attr_getter, add_input_ops, attr_getter,
     add_output_ops
 from mo.graph.graph import Node
 from mo.utils.error import Error
+from mo.utils.ir_engine.compare_graphs import compare_graphs
 from mo.utils.unittest.extractors import FakeMultiParam
 from mo.utils.unittest.graph import build_graph, build_graph_with_edge_attrs, build_graph_with_attrs
-from mo.utils.ir_engine.compare_graphs import compare_graphs
 
 
 class FakePythonParam:
index 2a0650b..c9ab624 100644 (file)
@@ -13,8 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.elementwise import Add
+from mo.front.extractor import FrontExtractorOp
 
 
 class AddFrontExtractor(FrontExtractorOp):
index 25df83f..9f94ad6 100644 (file)
@@ -18,7 +18,8 @@ import numpy as np
 
 from mo.front.caffe.extractors.utils import embed_input
 from mo.front.extractor import FrontExtractorOp
-from mo.front.kaldi.loader.utils import read_binary_bool_token, read_binary_integer32_token, collect_until_token, read_binary_float_token
+from mo.front.kaldi.loader.utils import read_binary_bool_token, read_binary_integer32_token, collect_until_token, \
+    read_binary_float_token
 from mo.front.kaldi.utils import read_binary_vector
 from mo.ops.scale_shift import ScaleShiftOp
 from mo.utils.error import Error
index 63b3157..c5f1e3a 100644 (file)
@@ -13,8 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.identity import IdentityOp
+from mo.front.extractor import FrontExtractorOp
 
 
 class ClipGradientComponentFrontExtractor(FrontExtractorOp):
index 6964b52..fb6bda6 100644 (file)
@@ -16,8 +16,8 @@
 
 import numpy as np
 
-from extensions.ops.transpose import Transpose
 from extensions.ops.gather import Gather
+from extensions.ops.transpose import Transpose
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.common.replacement import FrontReplacementOp
 from mo.front.kaldi.loader.utils import read_binary_integer32_token, read_blob
index 9c1c086..f7fd97d 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 from mo.front.extractor import FrontExtractorOp
-from mo.ops.eltwise_ninputs_in_1 import EltwiseNin1
 from mo.front.kaldi.utils import read_token_value
+from mo.ops.eltwise_ninputs_in_1 import EltwiseNin1
 
 
 class ElementwiseProductComponentFrontExtractor(FrontExtractorOp):
index f0a96c1..e1a7a5e 100644 (file)
  limitations under the License.
 """
 
+from extensions.ops.MatMul import FullyConnected
 from mo.front.caffe.extractors.utils import embed_input
 from mo.front.extractor import FrontExtractorOp
 from mo.front.kaldi.loader.utils import collect_until_token
 from mo.front.kaldi.utils import read_binary_matrix
-from extensions.ops.MatMul import FullyConnected
 
 
 class LinearComponentFrontExtractor(FrontExtractorOp):
index 2c96f13..2552c4b 100644 (file)
@@ -32,9 +32,13 @@ class LSTMNonlinearityFrontExtractor(FrontExtractorOp):
 
         mapping_rule = {}
 
-        embed_input(mapping_rule, 1, 'i_weights', ifo_x_weights[0:1024])
-        embed_input(mapping_rule, 2, 'f_weights', ifo_x_weights[1024:2048])
-        embed_input(mapping_rule, 3, 'o_weights', ifo_x_weights[2048:])
+        assert len(ifo_x_weights_shape) == 2, "Unexpected shape of weights in LSTMNonLinearityComponent"
+        assert ifo_x_weights_shape[0] == 3, "Unexpected shape of weights in LSTMNonLinearityComponent"
+
+        ifo_x_weights = ifo_x_weights.reshape(ifo_x_weights_shape)
+        embed_input(mapping_rule, 1, 'i_weights', ifo_x_weights[0][:])
+        embed_input(mapping_rule, 2, 'f_weights', ifo_x_weights[1][:])
+        embed_input(mapping_rule, 3, 'o_weights', ifo_x_weights[2][:])
 
         LstmNonLinearity.update_node_stat(node, mapping_rule)
         return cls.enabled
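
A small numpy sketch (illustration only, not part of the diff) of the reshape-based gate split introduced above; the cell dimension of 4 is an assumed value:

    import numpy as np

    cell_dim = 4                                                # assumed, for illustration
    ifo_x_weights = np.arange(3 * cell_dim, dtype=np.float32)   # flat blob read from the model
    ifo_x_weights_shape = np.array([3, cell_dim])

    ifo_x_weights = ifo_x_weights.reshape(ifo_x_weights_shape)
    i_weights = ifo_x_weights[0][:]   # each gate now gets a full row of the real size
    f_weights = ifo_x_weights[1][:]   # rather than a hard-coded 1024-element slice
    o_weights = ifo_x_weights[2][:]
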
index 377ec8a..ab15306 100644 (file)
@@ -17,7 +17,7 @@ import numpy as np
 
 from mo.front.common.extractors.utils import layout_attrs
 from mo.front.extractor import FrontExtractorOp
-from mo.front.kaldi.loader.utils import read_token_value, collect_until_whitespace, collect_until_token, \
+from mo.front.kaldi.loader.utils import read_token_value, collect_until_token, \
     read_binary_integer32_token, find_next_tag, read_placeholder
 from mo.ops.pooling import Pooling
 from mo.utils.error import Error
index aa160d7..f7250ab 100644 (file)
 
 import numpy as np
 
+from extensions.ops.normalize import NormalizeOp
 from mo.front.caffe.extractors.utils import embed_input
 from mo.front.extractor import FrontExtractorOp
 from mo.front.kaldi.loader.utils import collect_until_token, read_binary_bool_token, read_binary_integer32_token, \
-                                        read_binary_float_token
-from extensions.ops.normalize import NormalizeOp
+    read_binary_float_token
 from mo.utils.error import Error
 
 
index bbfca2b..d604141 100644 (file)
@@ -17,8 +17,8 @@
 import numpy as np
 
 from extensions.ops.normalize import NormalizeOp
-from mo.front.kaldi.extractors.normalize_component_ext import NormalizeComponentFrontExtractor
 from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
+from mo.front.kaldi.extractors.normalize_component_ext import NormalizeComponentFrontExtractor
 from mo.front.kaldi.loader.utils_test import TestKaldiUtilsLoading
 from mo.ops.op import Op
 
index d19c55b..068befe 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from extensions.ops.pnorm import PNormOp
 from mo.front.extractor import FrontExtractorOp
 from mo.front.kaldi.loader.utils import collect_until_token, read_binary_integer32_token, read_binary_float_token
-from extensions.ops.pnorm import PNormOp
 from mo.utils.error import Error
 
 
index e298027..6456808 100644 (file)
@@ -17,8 +17,8 @@
 import numpy as np
 
 from extensions.ops.pnorm import PNormOp
-from mo.front.kaldi.extractors.pnorm_component_ext import PNormComponentFrontExtractor
 from mo.front.kaldi.extractors.common_ext_test import KaldiFrontExtractorTest
+from mo.front.kaldi.extractors.pnorm_component_ext import PNormComponentFrontExtractor
 from mo.front.kaldi.loader.utils_test import TestKaldiUtilsLoading
 from mo.ops.op import Op
 
index a22336c..53becac 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.front.extractor import FrontExtractorOp
 from extensions.ops.activation_ops import ReLU
+from mo.front.extractor import FrontExtractorOp
 
 
 class RectifiedLinearComponentFrontExtractor(FrontExtractorOp):
index 95817ca..b471c3c 100644 (file)
  limitations under the License.
 """
 import io
-
-import numpy as np
+import logging as log
 import struct
 from io import IOBase
 
 import networkx as nx
-import logging as log
+import numpy as np
 
 from mo.front.kaldi.loader.utils import find_next_tag, read_placeholder, find_next_component, get_name_from_path, \
     find_end_of_component, end_of_nnet_tag, read_binary_integer32_token, get_parameters, read_token_value, \
index 32037f1..ff49c9c 100644 (file)
  limitations under the License.
 """
 import io
-import numpy as np
 import struct
 import unittest
 
+import numpy as np
+
 from mo.front.kaldi.loader.loader import load_topology_map, load_components
 from mo.graph.graph import Graph, Node
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 class TestKaldiModelsLoading(unittest.TestCase):
index dc1d146..06ae201 100644 (file)
  limitations under the License.
 """
 import io
-
-import numpy as np
 import os
 import struct
 
+import numpy as np
+
 from mo.utils.error import Error
 from mo.utils.utils import refer_to_faq_msg
 
index 8cbc1ca..63c4a5f 100644 (file)
  limitations under the License.
 """
 import io
-import numpy as np
-import os
 import logging as log
+import os
+
+import numpy as np
 
-from mo.front.kaldi.loader.utils import read_placeholder, read_binary_integer32_token, read_blob, read_token_value, find_next_tag
+from mo.front.kaldi.loader.utils import read_placeholder, read_binary_integer32_token, read_blob, read_token_value, \
+    find_next_tag
 from mo.utils.error import Error
 
 
index 9d1d9ed..af96c0b 100644 (file)
@@ -15,7 +15,6 @@
 """
 
 from mo.front.common.partial_infer.multi_box_prior import multi_box_prior_infer_mxnet
-from mo.utils.error import Error
 
 
 def multi_box_prior_ext(attr):
index 9d6e2d7..7914ed6 100644 (file)
@@ -16,9 +16,9 @@
 
 import mxnet as mx
 
+from extensions.ops.elementwise import Elementwise
 from mo.graph.graph import Node, Graph
 from mo.ops.const import Const
-from extensions.ops.elementwise import Elementwise
 from mo.utils.error import Error
 from mo.utils.str_to import StrTo
 from mo.utils.utils import refer_to_faq_msg
index 205a481..e47d974 100644 (file)
  limitations under the License.
 """
 
-import os
 import json
+import logging as log
+import os
 
-import numpy as np
 import mxnet as mx
-import logging as log
+import numpy as np
 
-from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states
 from mo.front.mxnet.extractor import common_mxnet_fields
+from mo.front.mxnet.extractors.utils import get_mxnet_node_edges, load_params, init_rnn_states
 from mo.front.mxnet.nd_to_params import build_params_file
 from mo.graph.graph import Node, Graph
 from mo.utils.error import Error
index acee183..a76809c 100644 (file)
@@ -17,6 +17,7 @@
 import os
 
 import mxnet as mx
+
 from mo.front.mxnet.extractors.utils import load_params
 
 
index 6ea9095..9e1225c 100644 (file)
@@ -20,7 +20,6 @@ from __future__ import unicode_literals
 
 import logging as log
 
-import networkx as nx
 import onnx
 
 from mo.graph.graph import fill_graph_with_nodes, Graph
index 1fc7a7c..f10466b 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
 import unittest
 
+import numpy as np
+
 from mo.front.tf.extractors.utils import collect_tf_attrs, tf_tensor_content
 from mo.utils.unittest.extractors import PB
 
index ee63b8a..32b70c3 100644 (file)
@@ -15,9 +15,8 @@
 """
 import collections
 import logging as log
-from typing import List
-
 from copy import deepcopy
+from typing import List
 
 import networkx as nx
 import numpy as np
index be4ec35..46e233d 100644 (file)
 import unittest
 
 import numpy as np
-
 from generator import generator, generate
 
 from mo.graph.graph import Node, Graph, add_opoutput, dict_includes_compare_attrs
 from mo.ops.const import Const
 from mo.utils.error import Error
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes = {
     '0': {'name': 'input1', 'type': 'Identity', 'value': None, 'kind': 'op', 'op': 'Parameter'},
index cee1964..92b5edc 100644 (file)
@@ -23,13 +23,13 @@ import traceback
 from collections import OrderedDict
 
 import numpy as np
-from mo.pipeline.unified import unified_pipeline
 
 from extensions.back.SpecialNodesFinalization import RemoveConstOps, CreateConstNodesReplacement, RemoveOutputOps, \
     NormalizeTI
 from mo.graph.graph import Graph
 from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively, for_each_sub_graph_recursively
 from mo.pipeline.common import prepare_emit_ir, get_ir_version
+from mo.pipeline.unified import unified_pipeline
 from mo.utils import import_extensions
 from mo.utils.cli_parser import get_placeholder_shapes, get_tuple_values, get_model_name, \
     get_common_cli_options, get_caffe_cli_options, get_tf_cli_options, get_mxnet_cli_options, get_kaldi_cli_options, \
index eae0378..d7a75a5 100644 (file)
@@ -20,8 +20,8 @@ import numpy as np
 
 from mo.graph.graph import Node
 from mo.middle.passes.conv import convert_muladd_to_scaleshift, convert_add_or_mul_to_scaleshift
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index dbb1695..a2ead94 100644 (file)
@@ -13,8 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 """
-import re
 import logging as log
+import re
 from collections import deque
 
 import networkx as nx
index e7a9afb..0b0159a 100644 (file)
@@ -18,7 +18,7 @@ import unittest
 
 import numpy as np
 
-from mo.graph.graph import Node, Graph
+from mo.graph.graph import Node
 from mo.middle.passes.eliminate import mark_output_reachable_nodes, mark_const_producer_nodes
 from mo.utils.unittest.graph import build_graph
 
index be2c6b4..f72f653 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from mo.middle.passes.fusing.decomposition import convert_scale_shift_to_mul_add, convert_batch_norm
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index e7f6a0c..a9968da 100644 (file)
@@ -21,8 +21,8 @@ import numpy as np
 from mo.front.common.partial_infer.eltwise import eltwise_infer
 from mo.graph.graph import Node
 from mo.middle.passes.fusing.fuse_linear_ops import _fuse_mul, _fuse_add, fuse_linear_ops
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index 01bfbe1..5a7a708 100644 (file)
@@ -18,11 +18,11 @@ import logging as log
 
 import numpy as np
 
-from mo.ops.const import Const
 from extensions.ops.elementwise import Mul, Add
 from mo.graph.graph import Node, Graph
 from mo.middle.passes.fusing.helpers import get_value_in_port, \
     get_tensor_in_port
+from mo.ops.const import Const
 
 
 def _fuse_linear_sequence(graph: Graph, start_node: Node):
index ddc571c..afa4a89 100644 (file)
@@ -19,8 +19,8 @@ import unittest
 import numpy as np
 
 from mo.middle.passes.fusing.fuse_linear_seq import fuse_mul_add_sequence
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
     'placeholder_1': {'shape': None, 'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'},
index f32f59f..50d6c83 100644 (file)
@@ -23,8 +23,8 @@ from mo.front.common.partial_infer.eltwise import eltwise_infer
 from mo.middle.passes.fusing.resnet_optimization import stride_optimization
 from mo.ops.convolution import Convolution
 from mo.ops.pooling import Pooling
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 max_elt_lambda = lambda node: eltwise_infer(node, lambda a, b: np.maximum(a, b))
 
index 309ce28..2609772 100644 (file)
@@ -15,6 +15,7 @@
 """
 
 import logging as log
+
 import networkx as nx
 import numpy as np
 
index dce839e..d2e417b 100644 (file)
@@ -16,8 +16,6 @@
 
 import logging as log
 
-import numpy as np
-
 from extensions.ops.activation_ops import LeakyReLU
 from mo.graph.graph import Graph
 from mo.middle.pattern_match import apply_pattern
index 82c2e6f..3b4410d 100644 (file)
@@ -20,8 +20,8 @@ from argparse import Namespace
 import numpy as np
 
 from mo.middle.passes.mean_scale_values import move_scaleshift_to_preprocess
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {'node_1': {'type': 'Identity', 'value': None, 'kind': 'op'},
                     'node_2': {'type': 'Identity', 'value': None, 'kind': 'op'},
index 0f62a20..5ec3959 100644 (file)
 """
 
 
-from defusedxml.minidom import parseString
 from xml.etree.ElementTree import Element, SubElement, tostring
 
+from defusedxml.minidom import parseString
+
 from mo.graph.graph import Node, Graph
 
 
index 8e498be..47371a6 100644 (file)
@@ -44,12 +44,21 @@ class Convolution(Op):
 
     def backend_attrs(self):
         if self.ir_version == 10:
+            def pad_attribute_helper(node: Node, pad_type: str='begin'):
+                assert pad_type in ['begin', 'end']
+                if not node.has_valid('pad'):
+                    return None
+                pad = get_backend_pad(node.pad, node.spatial_dims, 0 if pad_type == 'begin' else 1)
+                if node.has_valid('auto_pad'):
+                    pad = [0 for _ in pad]
+                return ','.join(map(str, pad))
+
             return [
                 'auto_pad',
                 ('strides', lambda node: ','.join(map(str, node['stride'][node.spatial_dims]))),
                 ('dilations', lambda node: ','.join(map(str, node['dilation'][node.spatial_dims]))),
-                ('pads_begin', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 0))) if node.has_valid('pad') else None),
-                ('pads_end', lambda node: ','.join(map(str, get_backend_pad(node.pad, node.spatial_dims, 1))) if node.has_valid('pad') else None),
+                ('pads_begin', lambda node: pad_attribute_helper(node, 'begin')),
+                ('pads_end', lambda node: pad_attribute_helper(node, 'end')),
                 ('output_padding', lambda node: ','.join(map(str, node.output_padding[node.spatial_dims])) \
                     if node.has_valid('output_padding') else None),
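
A standalone sketch (illustration only, not part of the diff) of what the new pad_attribute_helper serializes; get_backend_pad is mimicked here under the assumption that it selects the begin/end pad value of each spatial dimension:

    def pad_attribute_sketch(pad, spatial_dims, auto_pad=None, pad_type='begin'):
        # pad holds (begin, end) pairs per dimension; spatial_dims selects H/W
        if pad is None:
            return None                               # the attribute is omitted
        idx = 0 if pad_type == 'begin' else 1
        values = [pad[d][idx] for d in spatial_dims]
        if auto_pad is not None:
            values = [0 for _ in values]              # explicit pads are zeroed when auto_pad is set
        return ','.join(map(str, values))

    pads = [[0, 0], [0, 0], [1, 1], [2, 2]]                             # assumed NCHW pads
    print(pad_attribute_sketch(pads, [2, 3]))                           # "1,2"
    print(pad_attribute_sketch(pads, [2, 3], auto_pad='same_upper'))    # "0,0"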
 
index 558bc6b..5819c54 100644 (file)
  limitations under the License.
 """
 
-import numpy as np
 import logging as log
 
+import numpy as np
+
 from mo.graph.graph import Graph
 from mo.ops.op import Op
 
index c3b4856..1ab415d 100644 (file)
@@ -21,7 +21,7 @@ from mo.ops.op import Op, PermuteAttrs
 
 class Softmax(Op):
     op = 'SoftMax'
-    enabled = True
+    enabled = False
 
     def __init__(self, graph: Graph, attrs: dict):
         super().__init__(graph, {
@@ -44,3 +44,18 @@ class Softmax(Op):
         copy_shape_infer(node)
         PermuteAttrs.create_permute_attrs(node, attrs=[('axis', 'input:0')])
 
+
+class LogSoftmax(Op):
+    op = 'LogSoftmax'
+    enabled = False
+
+    def __init__(self, graph: Graph, attrs: dict):
+        super().__init__(graph, {
+            'infer': None,
+            'kind': 'op',
+            'axis': 1,
+            'type': None,  # the operation will be replaced with a Log(Softmax(x)) sub-graph
+            'op': __class__.op,
+            'in_ports_count': 1,
+            'out_ports_count': 1,
+        }, attrs)
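
The comment above notes that LogSoftmax is later replaced with a Log(Softmax(x)) sub-graph; a numpy sketch (illustration only, with an arbitrary example vector) of that identity:

    import numpy as np

    x = np.array([1.0, 2.0, 3.0])                 # arbitrary example input
    softmax = np.exp(x) / np.exp(x).sum()
    log_softmax = np.log(softmax)                 # what the Log(Softmax(x)) sub-graph computes
    # equivalently, log_softmax(x) == x - logsumexp(x)
    assert np.allclose(log_softmax, x - np.log(np.exp(x).sum()))
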
index 28a3399..759dcaa 100644 (file)
  limitations under the License.
 """
 
+import numpy as np
+
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.perm_inputs import PermuteInputs
 from mo.ops.op import Op
-import numpy as np
 
 
 class SpaceToBatch(Op):
index 417079e..49835d1 100644 (file)
@@ -17,9 +17,7 @@ import unittest
 
 import numpy as np
 
-from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
-from mo.ops.op import PermuteAttrs
 from mo.ops.squeeze import Squeeze
 from mo.utils.unittest.graph import build_graph
 
index 9289005..2d7da0c 100644 (file)
@@ -77,7 +77,7 @@ def permute_masks(node: Node, permutation: PermuteAttrs.Permutation, attr: str):
     if not node.has_valid(attr):
         return None
 
-    node[attr] = permute_array_with_ellipsis(node, node[attr], attr in ['begin_mask', 'end_mask'])
+    node[attr] = permute_array_with_ellipsis(node, node[attr], 0)
     return node[attr]
 
 
@@ -115,17 +115,33 @@ class StridedSlice(Op):
                                                            ('begin_mask', 'input:0', permute_masks),
                                                            ('end_mask', 'input:0', permute_masks),
                                                            ])
-            for i in range(1, len(node.in_nodes())):
-                if node.in_node(i).value is not None and len(node.in_node(0).shape) > 3:
-                    node.in_node(i).value = permute_array_with_ellipsis(node, node.in_node(i).value, 0)
+            # extend inputs according to the ellipsis mask
+            in_shape = node.in_port(0).get_source().data.get_shape()
+            assert in_shape is not None, \
+                'Input shape is unknown for input 0 of node {}'.format(node.name)
+            input_rank = len(in_shape)
+            if input_rank > 3:
+                for i_port in node.in_ports().values():
+                    if i_port.idx == 0 or i_port.disconnected():
+                        continue
+                    old_value = i_port.data.get_value()
+                    # additional check for a non-constant input:
+                    # shape inference raises an error if a non-constant input is connected here,
+                    # so this assert is only a defensive check in case shape inference changes
+                    assert old_value is not None, \
+                        '{} input of {} node is not constant: \'value\' attribute for edge ' \
+                        'contains None'.format(i_port.idx, node.name)
+                    # insert 0 for begin and end and 1 for stride
+                    new_value = permute_array_with_ellipsis(node, old_value, int(i_port.idx == 3))
+                    # set_value also sets the shape and propagates the value to the Const node
+                    i_port.data.set_value(new_value)
 
             # extend masks before removing ellipsis
             if np.any(node.ellipsis_mask):
                 for attr in ["new_axis_mask", "shrink_axis_mask", "begin_mask", "end_mask"]:
                     node[attr] = int64_array(extend_mask_according_ellipsis(node.ellipsis_mask, node.shrink_axis_mask,
                                                                             len(node.out_port(0).data.get_shape()),
-                                                                            list(node[attr]),
-                                                                            attr in ["begin_mask", "end_mask"]))
+                                                                            list(node[attr]), 0))
 
             # due to permutation from nhwc to nchw we will extend all masks and inputs
             idx = np.nonzero(node.ellipsis_mask)
index 907ffc8..64821ce 100644 (file)
@@ -22,25 +22,48 @@ from mo.graph.graph import Node
 from mo.ops.op import PermuteAttrs
 from mo.ops.strided_slice import extend_mask_according_ellipsis, permute_masks, permute_array_with_ellipsis, \
     StridedSlice
+from mo.utils.error import Error
 from mo.utils.unittest.graph import build_graph
 
 nodes_attributes = {
+    'input': {
+        'kind': 'op',
+        'op': None
+    },
     'data_1': {
         'kind': 'data',
         'shape': None,
         'value': None,
     },
     'begin': {
+        'kind': 'op',
+        'op': 'Const',
+        'value': None,
+        'shape': None
+    },
+    'begin_data': {
         'kind': 'data',
         'shape': None,
         'value': np.array([]),
     },
     'end': {
+        'kind': 'op',
+        'op': 'Const',
+        'value': None,
+        'shape': None
+    },
+    'end_data': {
         'kind': 'data',
         'shape': None,
         'value': np.array([]),
     },
     'stride': {
+        'kind': 'op',
+        'op': 'Const',
+        'value': None,
+        'shape': None
+    },
+    'stride_data': {
         'kind': 'data',
         'shape': None,
         'value': np.array([]),
@@ -102,10 +125,10 @@ class TestPermutationStridedSlice(unittest.TestCase):
 
         slice_node = Node(graph, 'strided_slice')
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0])))
 
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0])))
 
     def test_permute_begin_end_long(self):
         # Testing constant path case
@@ -172,10 +195,10 @@ class TestPermutationStridedSlice(unittest.TestCase):
 
         slice_node = Node(graph, 'strided_slice')
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask')
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0, 1])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0, 0])))
 
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask')
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0, 1])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0, 0])))
 
     def test_permute_begin_end_shrink(self):
         # Testing constant path case
@@ -217,20 +240,27 @@ class TestPermutationStridedSlice(unittest.TestCase):
 
         slice_node = Node(graph, 'strided_slice')
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 1, 0, 0])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 0, 0, 0])))
 
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 1, 1, 0])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([0, 0, 1, 0])))
 
     def test_permute_begin_end_ellipsis(self):
         # Testing constant path case
         graph = build_graph(nodes_attributes,
-                            [('data_1', 'strided_slice'),
-                             ('begin', 'strided_slice'),
-                             ('end', 'strided_slice'),
-                             ('stride', 'strided_slice'),
+                            [('input', 'data_1'),
+                             ('data_1', 'strided_slice'),
+                             ('begin', 'begin_data'),
+                             ('begin_data', 'strided_slice'),
+                             ('end', 'end_data'),
+                             ('end_data', 'strided_slice'),
+                             ('stride', 'stride_data'),
+                             ('stride_data', 'strided_slice'),
                              ('strided_slice', 'data_2')],
                             {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+                             'begin': {'value': [0, 1], 'shape': [2]},
+                             'end': {'value': [1, 0], 'shape': [2]},
+                             'stride': {'value': [1, 2], 'shape': [2]},
                              'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
                                                'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
                                                'ellipsis_mask': np.array([1, 0])},
@@ -239,20 +269,30 @@ class TestPermutationStridedSlice(unittest.TestCase):
 
         slice_node = Node(graph, 'strided_slice')
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'begin_mask')
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 1, 1])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 0])))
 
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 3, 1, 2], inv=[0, 2, 3, 1]), 'end_mask')
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 1, 1])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 0])))
 
     def test_permute_begin_end_ellipsis_infer(self):
         # Testing constant path case
         graph = build_graph(nodes_attributes,
-                            [('data_1', 'strided_slice'),
-                             ('begin', 'strided_slice'),
-                             ('end', 'strided_slice'),
-                             ('stride', 'strided_slice'),
+                            [('input', 'data_1'),
+                             ('data_1', 'strided_slice', {'in': 0}),
+                             ('begin', 'begin_data'),
+                             ('begin_data', 'strided_slice', {'in': 1}),
+                             ('end', 'end_data'),
+                             ('end_data', 'strided_slice', {'in': 2}),
+                             ('stride', 'stride_data'),
+                             ('stride_data', 'strided_slice', {'in': 3}),
                              ('strided_slice', 'data_2')],
                             {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+                             'begin': {'value': [0, 1], 'shape': [2]},
+                             'end': {'value': [1, 0], 'shape': [2]},
+                             'stride': {'value': [1, 2], 'shape': [2]},
+                             'begin_data': {'value': [0, 1], 'shape': [2]},
+                             'end_data': {'value': [1, 0], 'shape': [2]},
+                             'stride_data': {'value': [1, 2], 'shape': [2]},
                              'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
                                                'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
                                                'ellipsis_mask': np.array([1, 0])},
@@ -261,22 +301,38 @@ class TestPermutationStridedSlice(unittest.TestCase):
         graph.graph['layout'] = "NHWC"
 
         slice_node = Node(graph, 'strided_slice')
+        begin_node = Node(graph, 'begin')
+        end_node = Node(graph, 'end')
+        stride_node = Node(graph, 'stride')
         StridedSlice.infer(slice_node)
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 1, 1, 0])))
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 1, 1, 0])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 0])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 0])))
         self.assertTrue(np.array_equal(slice_node.shrink_axis_mask, np.array([0, 0, 0, 0])))
         self.assertTrue(np.array_equal(slice_node.new_axis_mask, np.array([0, 0, 0, 0])))
+        self.assertTrue(np.array_equal(begin_node.value, np.array([0, 1, 0, 0])))
+        self.assertTrue(np.array_equal(end_node.value, np.array([1, 0, 0, 0])))
+        self.assertTrue(np.array_equal(stride_node.value, np.array([1, 2, 1, 1])))
 
     def test_permute_begin_end_ellipsis_new(self):
         # Testing constant path case
         graph = build_graph(nodes_attributes,
-                            [('data_1', 'strided_slice'),
-                             ('begin', 'strided_slice'),
-                             ('end', 'strided_slice'),
-                             ('stride', 'strided_slice'),
+                            [('input', 'data_1'),
+                             ('data_1', 'strided_slice', {'in': 0}),
+                             ('begin', 'begin_data'),
+                             ('begin_data', 'strided_slice', {'in': 1}),
+                             ('end', 'end_data'),
+                             ('end_data', 'strided_slice', {'in': 2}),
+                             ('stride', 'stride_data'),
+                             ('stride_data', 'strided_slice', {'in': 3}),
                              ('strided_slice', 'data_2')],
                             {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
-                             'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]),
+                             'begin': {'value': [0, 1, 0], 'shape': [3]},
+                             'begin_data': {'value': [0, 1, 0], 'shape': [3]},
+                             'end': {'value': [1, 0, 1], 'shape': [3]},
+                             'end_data': {'value': [1, 0, 1], 'shape': [3]},
+                             'stride': {'value': [1, 2, 3], 'shape': [3]},
+                             'stride_data': {'value': [1, 2, 3], 'shape': [3]},
+                             'strided_slice': {'begin_mask': np.array([1, 2, 3]), 'end_mask': np.array([1, 2, 3]),
                                                'new_axis_mask': np.array([1, 0, 0]), 'shrink_axis_mask': [0],
                                                'ellipsis_mask': np.array([0, 1, 0])},
                              'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None},
@@ -284,18 +340,22 @@ class TestPermutationStridedSlice(unittest.TestCase):
 
         slice_node = Node(graph, 'strided_slice')
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'begin_mask')
-        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([0, 0, 0, 1, 1])))
+        self.assertTrue(np.array_equal(slice_node.begin_mask, np.array([1, 3, 2, 0, 0])))
 
         permute_masks(slice_node, PermuteAttrs.Permutation(perm=[0, 4, 1, 2, 3], inv=[0, 2, 3, 4, 1]), 'end_mask')
-        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 0, 0, 1, 1])))
+        self.assertTrue(np.array_equal(slice_node.end_mask, np.array([1, 3, 2, 0, 0])))
 
     def test_permute_begin_end_ellipsis_new_inputs(self):
         # Testing constant path case
         graph = build_graph(nodes_attributes,
-                            [('data_1', 'strided_slice'),
-                             ('begin', 'strided_slice'),
-                             ('end', 'strided_slice'),
-                             ('stride', 'strided_slice'),
+                            [('input', 'data_1'),
+                             ('data_1', 'strided_slice', {'in': 0}),
+                             ('begin', 'begin_data'),
+                             ('begin_data', 'strided_slice', {'in': 1}),
+                             ('end', 'end_data'),
+                             ('end_data', 'strided_slice', {'in': 2}),
+                             ('stride', 'stride_data'),
+                             ('stride_data', 'strided_slice', {'in': 3}),
                              ('strided_slice', 'data_2')],
                             {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
                              'strided_slice': {'begin_mask': np.array([0, 0, 0]), 'end_mask': np.array([1, 0, 0]),
@@ -304,6 +364,9 @@ class TestPermutationStridedSlice(unittest.TestCase):
                              'begin': {'value': np.array([0, 1, 2])},
                              'end': {'value': np.array([1, 2, 3])},
                              'stride': {'value': np.array([1, 1, 1])},
+                             'begin_data': {'value': np.array([0, 1, 2])},
+                             'end_data': {'value': np.array([1, 2, 3])},
+                             'stride_data': {'value': np.array([1, 1, 1])},
                              'data_2': {'shape': np.array([1, 1, 2, 3, 4]), 'value': None},
                              })
 
@@ -342,3 +405,29 @@ class TestPermutationStridedSlice(unittest.TestCase):
         shrink_mask = extend_mask_according_ellipsis(ellipsis_mask, shrink_mask, length_shape, list(shrink_mask),
                                                      ins_value)
         self.assertEquals(shrink_mask, [0, 0, 2, 2, 1])
+
+    def test_non_const_infer(self):
+        # Testing constant path case
+        graph = build_graph(nodes_attributes,
+                            [('input', 'data_1'),
+                             ('data_1', 'strided_slice', {'in': 0}),
+                             ('data_1', 'strided_slice', {'in': 1}),
+                             ('end', 'end_data'),
+                             ('end_data', 'strided_slice', {'in': 2}),
+                             ('stride', 'stride_data'),
+                             ('stride_data', 'strided_slice', {'in': 3}),
+                             ('strided_slice', 'data_2')],
+                            {'data_1': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+                             'end': {'value': [1, 0], 'shape': [2]},
+                             'stride': {'value': [1, 2], 'shape': [2]},
+                             'strided_slice': {'begin_mask': np.array([0, 0]), 'end_mask': np.array([1, 0]),
+                                               'new_axis_mask': np.array([0]), 'shrink_axis_mask': [0],
+                                               'ellipsis_mask': np.array([1, 0])},
+                             'data_2': {'shape': np.array([1, 2, 3, 4]), 'value': None},
+                             })
+        graph.graph['layout'] = "NHWC"
+
+        slice_node = Node(graph, 'strided_slice')
+        with self.assertRaises(Error) as error:
+            StridedSlice.infer(slice_node)
+        self.assertTrue('Strided slice layer supports only constant begin and end inputs' in str(error.exception))
index 170fca0..3b2c7b7 100644 (file)
@@ -22,8 +22,8 @@ from generator import generator
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Node
 from mo.ops.unsqueeze import Unsqueeze
-from mo.utils.unittest.graph import build_graph
 from mo.utils.ir_engine.compare_graphs import compare_graphs
+from mo.utils.unittest.graph import build_graph
 
 
 @generator
index f73a90b..1c940dd 100644 (file)
  limitations under the License.
 """
 import logging as log
-
-import networkx as nx
 import os
 from enum import Enum
 
+import networkx as nx
+
 from mo.graph.graph import Graph
 from mo.middle.passes.eliminate import shape_inference
 from mo.middle.pattern_match import for_graph_and_each_sub_graph_recursively
index 38f40e8..e15e5b4 100644 (file)
@@ -19,8 +19,6 @@ import logging as log
 import os
 from re import compile, match
 
-import networkx as nx
-
 from mo.graph.graph import Node, Graph
 from mo.utils.error import Error
 from mo.utils.graph import nodes_matching_name_pattern, sub_graph_between_nodes
index b55b090..9a66951 100644 (file)
  limitations under the License.
 """
 
+import logging as log
 from collections import deque
 from re import match, compile
 
-import logging as log
 import networkx as nx
 
 from mo.graph.graph import Node, Graph
index 0484d9d..c2d1e76 100644 (file)
 
 import unittest
 
-import networkx as nx
-
+from mo.graph.graph import Graph
 from mo.utils.error import Error
 from mo.utils.graph import bfs_search, is_connected_component, sub_graph_between_nodes
-from mo.graph.graph import Graph
+
 
 class TestGraphUtils(unittest.TestCase):
     def test_simple_dfs(self):
index 62d8ee2..fd6604a 100644 (file)
 """
 
 import logging as log
-from mo.utils.graph import Node
-from mo.utils import class_registration
 
 from mo.front.common.partial_infer.utils import int64_array
+from mo.utils import class_registration
+from mo.utils.graph import Node
 
 
 class Extender(object):
diff --git a/model-optimizer/mo/utils/ir_reader/extenders/GRUCell_extender.py b/model-optimizer/mo/utils/ir_reader/extenders/GRUCell_extender.py
new file mode 100644 (file)
index 0000000..b5ff5a1
--- /dev/null
@@ -0,0 +1,32 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.front.common.partial_infer.utils import mark_input_bins
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class GRUCell_extender(Extender):
+    op = 'GRUCell'
+
+    @staticmethod
+    def extend(op: Node):
+        if not op.has_valid('activations'):
+            op['activations'] = None
+
+        mark_input_bins(op, start_port=2)
+
+        op['need_copy_input_blobs'] = True
index 74dfca8..4ce8d8d 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
-
-from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.ir_reader.extender import Extender
 
 
 class LSTMCell_extender(Extender):
diff --git a/model-optimizer/mo/utils/ir_reader/extenders/RNNCell_extender.py b/model-optimizer/mo/utils/ir_reader/extenders/RNNCell_extender.py
new file mode 100644 (file)
index 0000000..7fcb3ff
--- /dev/null
@@ -0,0 +1,27 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class RNNCell_extender(Extender):
+    op = 'RNNCell'
+
+    @staticmethod
+    def extend(op: Node):
+        if not op.has_valid('activations'):
+            op['activations'] = None
index f839966..c5f64f1 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from mo.utils.graph import Node
 from mo.utils.ir_reader.extender import Extender
 from mo.utils.ir_reader.extenders.conv_extender import Conv_extender
-from mo.utils.graph import Node
 
 
 class BinaryConv_extender(Extender):
index c40853c..389366b 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
 from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class Conv_extender(Extender):
index 7711e84..d7b9b44 100644 (file)
 
 import numpy as np
 
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
 from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class ConvolutionBackpropData_extender(Extender):
index fde792f..b48def2 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from mo.utils.graph import Node
 from mo.utils.ir_reader.extender import Extender
 from mo.utils.ir_reader.extenders.conv_extender import Conv_extender
-from mo.utils.graph import Node
 
 
 class DeformableConv_extender(Extender):
index cf8cc0e..c513386 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class ExperimentalDetectronROIFeatureExtractor_extender(Extender):
index 2402bee..a1958d6 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class FakeQuantize_extender(Extender):
index da47527..a83c293 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class Interpolate_extender(Extender):
diff --git a/model-optimizer/mo/utils/ir_reader/extenders/non_zero_extender.py b/model-optimizer/mo/utils/ir_reader/extenders/non_zero_extender.py
new file mode 100644 (file)
index 0000000..d276cce
--- /dev/null
@@ -0,0 +1,27 @@
+"""
+ Copyright (C) 2018-2020 Intel Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+"""
+from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
+
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
+
+
+class NonZeroExtender(Extender):
+    op = 'NonZero'
+
+    @staticmethod
+    def extend(op: Node):
+        op['output_type'] = destination_type_to_np_data_type(op.output_type)
index f13e930..35632b7 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class Pad_extender(Extender):
index a3d675c..e0b8583 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
+from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
-from mo.middle.passes.convert_data_type import destination_type_to_np_data_type
 
 class Parameter_extender(Extender):
     op = 'Parameter'
index 4bc94bc..e47dcca 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
-from mo.graph.graph import Node
-
 from mo.front.common.partial_infer.utils import int64_array
+from mo.graph.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class AvgPool_extender(Extender):
index 9b8a03b..600f586 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 
+from mo.utils.graph import Node
 from mo.utils.ir_reader.extender import Extender
 from mo.utils.ir_reader.extenders.priorbox_extender import PriorBox_extender
-from mo.utils.graph import Node
 
 
 class PriorBoxClustered_extender(Extender):
index 82164c4..4b535cc 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
 from mo.front.common.partial_infer.multi_box_prior import multi_box_prior_infer_mxnet
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class PriorBox_extender(Extender):
index d444521..d4a0524 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
 from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class ReorgYolo_extender(Extender):
index 0003f11..dfc7c84 100644 (file)
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
-from mo.utils.graph import Node
-
 from mo.front.common.partial_infer.utils import int64_array
+from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class StridedSlice_extender(Extender):
index 50256d4..ef18451 100644 (file)
@@ -14,9 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
-
+from mo.utils.ir_reader.extender import Extender
 from mo.utils.ir_reader.layer_to_class import copy_graph_with_ops
 
 
index 195e57c..8f7c6e0 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class TopK_extender(Extender):
index c7b1cd2..cd80d0d 100644 (file)
@@ -14,8 +14,8 @@
  limitations under the License.
 """
 
-from mo.utils.ir_reader.extender import Extender
 from mo.utils.graph import Node
+from mo.utils.ir_reader.extender import Extender
 
 
 class VariadicSplit_extender(Extender):
index 3fba047..f91a072 100644 (file)
@@ -24,6 +24,7 @@ from extensions.ops.ReduceOps import ReduceOp
 from extensions.ops.activation_ops import Activation
 from extensions.ops.elementwise import Elementwise, LogicalElementwise, BiasAdd, Div, Mul, Pow, Sub
 from extensions.ops.psroipooling import DeformablePSROIPoolingOp
+from extensions.ops.scatter import Scatter
 from extensions.ops.split import Split, VariadicSplit
 from mo.front.common.partial_infer.utils import int64_array
 from mo.graph.graph import Graph, Node
@@ -62,7 +63,7 @@ def collect_ops(path: str):
     """
     import_by_path(os.path.join(path, 'mo', 'ops'), ['mo', 'ops'])
     import_by_path(os.path.join(path, 'extensions', 'ops'), ['extensions', 'ops'])
-    update_registration(classes=[Op, Activation, Elementwise, LogicalElementwise, ReduceOp],
+    update_registration(classes=[Op, Activation, Elementwise, LogicalElementwise, ReduceOp, Scatter],
                         enabled_transforms=[], disabled_transforms=[])
 
 
@@ -142,7 +143,7 @@ def propagate_const_values(op: Node):
             if weights_rounded[elem] == 0:
                 weights_rounded[elem] -= 1  # pylint: disable=unsupported-assignment-operation
         assert len(weights_rounded) % 8 == 0
-        weights_rounded = weights_rounded.reshape([len(weights_rounded) // 8, 8])   # pylint: disable=no-member
+        weights_rounded = weights_rounded.reshape([len(weights_rounded) // 8, 8])  # pylint: disable=no-member
         weights_rounded = np.flip(weights_rounded, axis=1)
         value = weights_rounded.flatten()
 
@@ -157,8 +158,8 @@ def groupconv_to_conv(op: Node):
     :param op:
     :return:
     """
-    assert op.soft_get('type') == 'GroupConvolution', 'Wrong operation type, {} instead of GroupConvolution!' \
-                                                      ''.format(op.soft_get('type'))
+    assert op.soft_get('type') == 'GroupConvolution', \
+        'Wrong operation type, {} instead of GroupConvolution!'.format(op.soft_get('type'))
 
     weights_shape = op.in_port(1).data.get_shape()
     group = weights_shape[0]
@@ -169,13 +170,13 @@ def groupconv_to_conv(op: Node):
         weights_node.value = np.reshape(weights_node.value, new_shape)
     elif weights_node.type == 'Reshape':
         # we remove reshape node added in ConvolutionWithGroupsResolver pass
-        assert weights_node.in_port(0).get_source().data.get_shape() == new_shape, 'Weight shape and calculated ' \
-                                                                'shape mismatch in GroupConv node {}.'.format(op.name)
+        assert weights_node.in_port(0).get_source().data.get_shape() == new_shape, \
+            'Weight shape and calculated shape mismatch in GroupConv node {}.'.format(op.name)
         op.in_port(1).disconnect()
         weights_node.in_port(0).get_source().get_connection().set_destination(op.in_port(1))
     else:
-        assert op.in_port(1).get_source().data.get_shape() == new_shape, 'Weight shape and calculated ' \
-                                                                'shape mismatch in GroupConv node {}.'.format(op.name)
+        assert op.in_port(1).get_source().data.get_shape() == new_shape, \
+            'Weight shape and calculated shape mismatch in GroupConv node {}.'.format(op.name)
     # we need to set this attrs for correct shape infer as convolution
     op['group'] = group
     op.type = 'Convolution'
@@ -187,7 +188,7 @@ def backprop_to_deconv(op: Node):
     :param op:
     :return:
     """
-    assert op.soft_get('type') in ('ConvolutionBackpropData', 'GroupConvolutionBackpropData'),\
+    assert op.soft_get('type') in ('ConvolutionBackpropData', 'GroupConvolutionBackpropData'), \
         'Wrong operation type, {} instead of ConvolutionBackpropData/GroupConvolutionBackpropData!' \
         ''.format(op.soft_get('type'))
 
@@ -217,6 +218,20 @@ def ti_add_edge_attrs(op: Node):
         i += 1
 
 
+def copy_input_blobs(op: Node, copy_op: Node):
+    """
+    Copies input blob data nodes from the restored graph to the copied one
+    :param op: Node from the restored graph
+    :param copy_op: Node from the copied graph
+    :return:
+    """
+    for u, d in op.get_sorted_inputs():
+        if 'bin' in d:
+            Op.create_and_connect_input_data_node(copy_op.graph, copy_op,
+                                                  {'value': op.in_node(d['in']).value,
+                                                   'shape': op.in_node(d['in']).shape}, d)
+
+
 # Map with preprocessing functions
 preprocessing_op_nodes = {
     'Const': propagate_const_values,
@@ -276,6 +291,9 @@ def copy_graph_with_ops(graph: Graph) -> Graph:
                                                  'please check it!'.format(op_type)
             node = Op.get_op_class_by_name(op_type)(new_graph, op.attrs()).create_node()
 
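+        # for nodes restored with constant ('bin') input blobs, copy those data nodes into the new graph as well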
+        if op.has_and_set('need_copy_input_blobs'):
+            copy_input_blobs(op, node)
+
         # Collect node connections
         mapping_of_old_idx_into_new[op.id] = node.id
         node_connections[op.id] = collect_node_outputs(op)
index 88c6d09..340b188 100644 (file)
@@ -19,7 +19,6 @@ import re
 from mo.utils.error import Error
 from mo.utils.simple_proto_parser import SimpleProtoParser
 
-
 # The list of rules how to map the value from the pipeline.config file to the dictionary with attributes.
 # The rule is either a string or a tuple with two elements. In the first case the rule string is used as a key to
 # search in the parsed pipeline.config file attributes dictionary and a key to save found value. In the second case the
index 914dd95..59948f7 100644 (file)
@@ -14,8 +14,6 @@
  limitations under the License.
 """
 
-import networkx as nx
-
 from mo.graph.graph import Graph
 from mo.middle.pattern_match import apply_pattern
 
index 887c3c3..9b1a118 100644 (file)
@@ -14,9 +14,9 @@
  limitations under the License.
 """
 from argparse import Namespace
+from copy import deepcopy
 
 import networkx as nx
-from copy import deepcopy
 
 from mo.front.common.partial_infer.utils import int64_array
 from mo.front.extractor import extract_port_from_string
index 5299159..0f2d9ea 100644 (file)
@@ -16,8 +16,6 @@
 
 import collections
 
-import networkx as nx
-
 from mo.graph.graph import Node, Graph
 
 
index e3c0b55..661244b 100644 (file)
@@ -15,8 +15,9 @@
 """
 
 import unittest
-import networkx as nx
+
 import numpy as np
+
 from mo.utils.utils import match_shapes
 
 
index 8d7d15f..227b74e 100644 (file)
 
 import unittest
 import unittest.mock as mock
-
 from unittest.mock import mock_open
+
 from mo.utils.versions_checker import get_module_version_list_from_file, parse_versions_list
 
+
 class TestingVersionsChecker(unittest.TestCase):
     @mock.patch('builtins.open', new_callable=mock_open, create=True)
     def test_get_module_version_list_from_file(self, mock_open):
index d325751..38ecd7c 100644 (file)
@@ -1,6 +1,7 @@
 coverage==4.4.2
 m2r==0.1.12
 pyenchant==1.6.11
+astroid==2.1.0
 pylint==2.1.1
 Sphinx==1.6.5
 safety==1.8.5
diff --git a/ngraph b/ngraph
index edc65ca..eaa6d35 160000 (submodule)
--- a/ngraph
+++ b/ngraph
@@ -1 +1 @@
-Subproject commit edc65ca0111f86a7e63a98f62cb17d153cc2535c
+Subproject commit eaa6d35b7ed415e02b2401b528f31960123e5b71
diff --git a/scripts/demo/README.txt b/scripts/demo/README.txt
new file mode 100644 (file)
index 0000000..a4e8fe7
--- /dev/null
@@ -0,0 +1,83 @@
+=====================================================
+Demo Scripts for Model Optimizer and Inference Engine
+=====================================================
+
+The demo scripts illustrate how to use the Intel(R) Deep Learning Deployment Toolkit to convert and optimize pre-trained models and perform inference.
+
+Setting Up Demos
+================
+If you are behind a proxy, set the following environment variables in the console session:
+
+On Linux* and Mac OS:
+export http_proxy=http://<proxyHost>:<proxyPort>
+export https_proxy=https://<proxyHost>:<proxyPort>
+
+On Windows* OS:
+set http_proxy=http://<proxyHost>:<proxyPort>
+set https_proxy=https://<proxyHost>:<proxyPort>
+
+Running Demos
+=============
+
+The "demo" folder contains three scripts:
+
+1. Classification demo using public SqueezeNet topology (demo_squeezenet_download_convert_run.sh|bat)
+
+2. Security barrier camera demo that showcases three models coming with the product (demo_security_barrier_camera.sh|bat)
+
+3. Benchmark demo using public SqueezeNet topology (demo_benchmark_app.sh|bat) 
+
+To run a demo, run the demo_squeezenet_download_convert_run.sh, demo_security_barrier_camera.sh, or demo_benchmark_app.sh script (*.bat on Windows) from the console without parameters, for example:
+
+./demo_squeezenet_download_convert_run.sh
+
+Each script allows you to specify the target device to infer on using the -d <CPU|GPU|MYRIAD|FPGA> option.
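+
+For example, to run the benchmark demo on a GPU (assuming the GPU plugin and drivers are installed):
+
+./demo_benchmark_app.sh -d GPU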
+
+Classification Demo Using SqueezeNet
+====================================
+
+The demo illustrates the general workflow of using the Intel(R) Deep Learning Deployment Toolkit and performs the following:
+
+  - Downloads a public SqueezeNet model using the Model Downloader (open_model_zoo\tools\downloader\downloader.py)
+  - Installs all prerequisites required for running the Model Optimizer using the scripts from the "model_optimizer\install_prerequisites" folder
+  - Converts SqueezeNet to an IR using the Model Optimizer (model_optimizer\mo.py) via the Model Converter (open_model_zoo\tools\downloader\converter.py)
+  - Builds the Inference Engine classification sample (inference_engine\samples\classification_sample_async)
+  - Runs the sample with the car.png picture located in the demo folder
+
+The sample application prints top-10 inference results for the picture.
+For more information about the Inference Engine classification sample, refer to the documentation available in the sample folder.
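+
+After the demo completes, you can rerun the built sample manually on another image. A sketch of such a command, assuming the default build and IR locations created by the Linux script (paths may differ on your system):
+
+~/inference_engine_samples_build/intel64/Release/classification_sample_async -d CPU -i <path_to_image> -m ~/openvino_models/ir/<model_subdirectory>/FP16/squeezenet1.1.xml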
+
+
+Security Barrier Camera Demo
+============================
+
+The demo illustrates using the Inference Engine with pre-trained models to perform vehicle detection, vehicle attribute recognition, and license-plate recognition tasks.
+As the sample produces visual output, it should be run in GUI mode.
+
+The demo script does the following:
+
+- Builds the Inference Engine security barrier camera sample (inference_engine\samples\security_barrier_camera_sample)
+- Runs the sample with the car_1.bmp located in the demo folder
+
+The sample application displays the resulting frame with detections rendered as bounding boxes and text.
+
+For more information about the Inference Engine security barrier camera sample, refer to the documentation available in the sample folder.
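+
+The demo script runs a command similar to the following sketch (the model IR paths are resolved by the script from demo_security_barrier_camera.conf and may differ on your system):
+
+./security_barrier_camera_demo -d CPU -d_va CPU -d_lpr CPU -i car_1.bmp -m <vehicle-license-plate-detection-barrier-0106.xml> -m_va <vehicle-attributes-recognition-barrier-0039.xml> -m_lpr <license-plate-recognition-barrier-0001.xml>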
+
+
+Benchmark Demo Using SqueezeNet
+===============================
+
+The demo illustrates how to use the Benchmark Application to estimate deep learning inference performance on supported devices.
+
+The demo script does the following:
+
+  - Downloads a public SqueezeNet model using the Model Downloader (open_model_zoo\tools\downloader\downloader.py)
+  - Installs all prerequisites required for running the Model Optimizer using the scripts from the "model_optimizer\install_prerequisites" folder
+  - Converts SqueezeNet to an IR using the Model Optimizer (model_optimizer\mo.py) via the Model Converter (open_model_zoo\tools\downloader\converter.py)
+  - Builds the Inference Engine benchmark tool (inference_engine\samples\benchmark_app)
+  - Runs the tool with the car.png picture located in the demo folder
+
+The benchmark app prints performance counters, resulting latency, and throughput values.
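+
+The demo script runs a command similar to the following sketch (the IR path depends on where the script stored the converted model):
+
+./benchmark_app -d CPU -i car.png -m <ir_dir>/squeezenet1.1.xml -pc -niter 1000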
+For more information about the Inference Engine benchmark app, refer to the documentation available in the sample folder.
\ No newline at end of file
diff --git a/scripts/demo/car.png b/scripts/demo/car.png
new file mode 100644 (file)
index 0000000..f22d8d6
Binary files /dev/null and b/scripts/demo/car.png differ
diff --git a/scripts/demo/car_1.bmp b/scripts/demo/car_1.bmp
new file mode 100644 (file)
index 0000000..111cee4
Binary files /dev/null and b/scripts/demo/car_1.bmp differ
diff --git a/scripts/demo/demo_benchmark_app.bat b/scripts/demo/demo_benchmark_app.bat
new file mode 100644 (file)
index 0000000..4b29fb3
--- /dev/null
@@ -0,0 +1,253 @@
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+    if "%1"=="-d" (
+        set TARGET=%2
+        echo target = !TARGET!
+        shift
+    )
+    if "%1"=="-sample-options" (
+        set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+        echo sample_options = !SAMPLE_OPTIONS!
+        shift
+    )
+    if "%1"=="-help" (
+        echo %~n0%~x0 is a benchmark demo using the public SqueezeNet topology
+        echo.
+        echo Options:
+        echo -d name     Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device
+        exit /b
+    )
+    shift
+    goto :input_arguments_loop
+)
+
+IF "%SAMPLE_OPTIONS%"=="" (
+      set SAMPLE_OPTIONS=-niter 1000 
+)
+
+set ROOT_DIR=%~dp0
+
+set TARGET_PRECISION=FP16
+
+
+echo target_precision = !TARGET_PRECISION!
+
+set models_path=%BUILD_FOLDER%\openvino_models\models
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+set irs_path=%BUILD_FOLDER%\openvino_models\ir
+
+set model_name=squeezenet1.1
+
+set target_image_path=%ROOT_DIR%car.png
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+    call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+    echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+    goto errorHandling
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+   echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: Check if the Python version is 3.5 or higher
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+   set version=%%F
+)
+echo %version%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+   set Major=%%b
+   set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+   if "%Minor%" geq "5" (
+       set python_ver=okay
+   )
+)
+if not "%python_ver%"=="okay" (
+   echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=* usebackq" %%d in (
+    `python "%downloader_dir%\info_dumper.py" --name "%model_name%" ^|
+        python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+) do (
+    set model_dir=%%d
+)
+
+set ir_dir=%irs_path%\%model_dir%\%target_precision%
+
+echo Download public %model_name% model
+echo python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+echo %model_name% model downloading completed
+
+timeout 7
+
+if exist %ir_dir% (
+    echo.
+    echo Target folder %ir_dir% already exists. Skipping IR generation with Model Optimizer.
+    echo If you want to convert a model again, remove the entire %ir_dir% folder.
+    timeout 7
+    GOTO buildSample
+)
+
+echo.
+echo ###############^|^| Install Model Optimizer prerequisites ^|^|###############
+echo.
+timeout 3
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites"
+call install_prerequisites_caffe.bat
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Run Model Optimizer ^|^|###############
+echo.
+timeout 3
+
+::set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+echo python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:buildSample
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine samples using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+   set "PLATFORM=x64"
+) else (
+   set "PLATFORM=Win32"
+)
+
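+:: Locate MSBuild: prefer vswhere-based lookup (Visual Studio 2017 and newer), then fall back to known MSBuild install paths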
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+   set VSWHERE="true"
+   cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+      set VSWHERE="true"
+      cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+   echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+   for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+      set VS_PATH=%%i
+   )
+   if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+   )
+)
+
+if "!MSBUILD_BIN!" == "" (
+   if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=14 2015"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+) else (
+   if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+   if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+   if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+   echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+   GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+echo.
+echo ###############^|^| Build Inference Engine samples using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo "!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:benchmark_app /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:benchmark_app /clp:ErrorsOnly /m
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine benchmark app ^|^|###############
+echo.
+timeout 3
+copy /Y "%ROOT_DIR%%model_name%.labels" "%ir_dir%"
+cd "%SOLUTION_DIR64%\intel64\Release"
+
+echo benchmark_app.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -pc  -d  !TARGET! !SAMPLE_OPTIONS!
+benchmark_app.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -pc  -d  !TARGET! !SAMPLE_OPTIONS!
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Inference Engine benchmark app completed successfully ^|^|###############
+
+timeout 10
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
diff --git a/scripts/demo/demo_benchmark_app.sh b/scripts/demo/demo_benchmark_app.sh
new file mode 100644 (file)
index 0000000..6ed2702
--- /dev/null
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+    echo "Benchmark demo using public SqueezeNet topology"
+    echo "-d name     specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device"
+    echo "-help            print help message"
+    exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+    -h | -help | --help)
+    usage
+    ;;
+    -d)
+    target="$2"
+    echo target = "${target}"
+    shift
+    ;;
+    -sample-options)
+    sampleoptions="$2 $3 $4 $5 $6"
+    echo sample-options = "${sampleoptions}"
+    shift
+    ;;
+    *)
+    # unknown option
+    ;;
+esac
+shift
+done
+
+if [ -z "$sampleoptions" ]; then
+    sampleoptions="-niter 1000"
+fi
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+models_path="$HOME/openvino_models/models"
+models_cache="$HOME/openvino_models/cache"
+irs_path="$HOME/openvino_models/ir"
+
+model_name="squeezenet1.1"
+
+target_image_path="$ROOT_DIR/car.png"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+    setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+    printf "Error: setupvars.sh is not found\n"
+fi
+
+if ! . $setupvars_path ; then
+    printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+    exit 1
+fi
+
+# Step 1. Download the Caffe model and the prototxt of the model
+printf "${dashes}"
+printf "\n\nDownloading the Caffe model and the prototxt"
+
+cur_path=$PWD
+
+printf "\nInstalling dependencies\n"
+
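+# Detect the host OS to choose the package manager and the Python/pip binaries to use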
+if [[ -f /etc/centos-release ]]; then
+    DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+    DISTRO="ubuntu"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+    sudo -E yum install -y centos-release-scl epel-release
+    sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+                           glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+    sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+    sudo -E yum install -y epel-release
+    sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+    # check installed Python version
+    if command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    fi
+    if command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+    if [ -z "$python_binary" ]; then
+        sudo -E yum install -y rh-python36 || true
+        . scl_source enable rh-python36
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+    sudo -E apt update
+    print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+    python_binary=python3
+    pip_binary=pip3
+
+    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+    if [ $system_ver = "18.04" ]; then
+        sudo -E apt-get install -y libpng-dev
+    else
+        sudo -E apt-get install -y libpng12-dev
+    fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    # check installed Python version
+    if command -v python3.7 >/dev/null 2>&1; then
+        python_binary=python3.7
+        pip_binary=pip3.7
+    elif command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    elif command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    else
+        python_binary=python3
+        pip_binary=pip3
+    fi
+fi
+
+if ! command -v $python_binary &>/dev/null; then
+    printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+    exit 1
+fi
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+else
+    sudo -E $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+fi
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+    "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+downloader_path="$downloader_dir/downloader.py"
+
+print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}"
+
+ir_dir="${irs_path}/${model_dir}/${target_precision}"
+
+if [ ! -e "$ir_dir" ]; then
+    # Step 2. Configure Model Optimizer
+    printf "${dashes}"
+    printf "Install Model Optimizer dependencies\n\n"
+    cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites"
+    . ./install_prerequisites.sh caffe
+    cd $cur_path
+
+    # Step 3. Convert a model with Model Optimizer
+    printf "${dashes}"
+    printf "Convert a model with Model Optimizer\n\n"
+
+    mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py"
+
+    export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+    print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision"
+else
+    printf "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer."
+    printf "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}"
+fi
+
+# Step 4. Build samples
+printf "${dashes}"
+printf "Build Inference Engine samples\n\n"
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ $OS_PATH == "x86_64" ]; then
+  OS_PATH="intel64"
+  NUM_THREADS="-j8"
+fi
+
+samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp"
+build_dir="$HOME/inference_engine_samples_build"
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+
+if [ -e $build_dir/CMakeCache.txt ]; then
+       rm -rf $build_dir/CMakeCache.txt
+fi
+mkdir -p $build_dir
+cd $build_dir
+cmake -DCMAKE_BUILD_TYPE=Release $samples_path
+
+make $NUM_THREADS benchmark_app
+
+# Step 5. Run samples
+printf "${dashes}"
+printf "Run Inference Engine benchmark app\n\n"
+
+cd $binaries_dir
+
+cp -f $ROOT_DIR/${model_name}.labels ${ir_dir}/
+
+print_and_run ./benchmark_app -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" -pc ${sampleoptions}
+
+printf "${dashes}"
+
+printf "Inference Engine benchmark app completed successfully.\n\n"
diff --git a/scripts/demo/demo_security_barrier_camera.bat b/scripts/demo/demo_security_barrier_camera.bat
new file mode 100644 (file)
index 0000000..648dae5
--- /dev/null
@@ -0,0 +1,213 @@
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set SAMPLE_OPTIONS=
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+    if "%1"=="-d" (
+        set TARGET=%2
+        echo target = !TARGET!
+        shift
+    )
+    if "%1"=="-sample-options" (
+        set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+        echo sample_options = !SAMPLE_OPTIONS!
+        shift
+    )
+    if "%1"=="-help" (
+        echo %~n0%~x0 is a security barrier camera demo that showcases three models coming with the product
+        echo.
+        echo Options:
+        echo -d name     Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device
+        exit /b
+    )
+    shift
+    goto :input_arguments_loop
+)
+
+set ROOT_DIR=%~dp0
+
+set target_image_path=%ROOT_DIR%car_1.bmp
+
+
+set TARGET_PRECISION=FP16
+
+
+echo target_precision = !TARGET_PRECISION!
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+    call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+    echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+    goto errorHandling
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+   echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: Check if the Python version is 3.5 or higher
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+   set version=%%F
+)
+echo %version%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+   set Major=%%b
+   set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+   if "%Minor%" geq "5" (
+       set python_ver=okay
+   )
+)
+if not "%python_ver%"=="okay" (
+   echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+
+set models_path=%BUILD_FOLDER%\openvino_models\ir
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+
+if not exist %models_cache% (
+  mkdir %models_cache%
+)
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=1,2 usebackq" %%a in ("%ROOT_DIR%demo_security_barrier_camera.conf") do (
+   echo python "%downloader_dir%\downloader.py" --name "%%b" --output_dir "%models_path%" --cache_dir "%models_cache%"
+   python "%downloader_dir%\downloader.py" --name "%%b" --output_dir "%models_path%" --cache_dir "%models_cache%"
+
+   for /F "tokens=* usebackq" %%d in (
+      `python "%downloader_dir%\info_dumper.py" --name "%%b" ^|
+         python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+   ) do (
+      set model_args=!model_args! %%a "%models_path%\%%d\%target_precision%\%%b.xml"
+   )
+)
+
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine demos using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+   set "PLATFORM=x64"
+) else (
+   set "PLATFORM=Win32"
+)
+
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+   set VSWHERE="true"
+   cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+      set VSWHERE="true"
+      cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+   echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+   for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+      set VS_PATH=%%i
+   )
+   if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+   )
+)
+
+if "!MSBUILD_BIN!" == "" (
+   if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=14 2015"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+) else (
+   if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+   if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+   if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+   echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017 / 2019, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+   GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_demos_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\demos" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\demos"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Build Inference Engine demos using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo "!MSBUILD_BIN!" Demos.sln /p:Configuration=Release /t:security_barrier_camera_demo /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Demos.sln /p:Configuration=Release /t:security_barrier_camera_demo /clp:ErrorsOnly /m
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine security barrier camera demo ^|^|###############
+echo.
+timeout 3
+cd "%SOLUTION_DIR64%\intel64\Release"
+echo "%SOLUTION_DIR64%\intel64\Release\security_barrier_camera_demo.exe" -i "%target_image_path%" %model_args% -d !TARGET! -d_va !TARGET! -d_lpr !TARGET! !SAMPLE_OPTIONS!
+security_barrier_camera_demo.exe -i "%target_image_path%" %model_args% ^
+                                 -d !TARGET! -d_va !TARGET! -d_lpr !TARGET! !SAMPLE_OPTIONS!
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Demo completed successfully ^|^|###############
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
diff --git a/scripts/demo/demo_security_barrier_camera.conf b/scripts/demo/demo_security_barrier_camera.conf
new file mode 100644 (file)
index 0000000..c283abc
--- /dev/null
@@ -0,0 +1,3 @@
+-m     vehicle-license-plate-detection-barrier-0106
+-m_lpr license-plate-recognition-barrier-0001
+-m_va  vehicle-attributes-recognition-barrier-0039
diff --git a/scripts/demo/demo_security_barrier_camera.sh b/scripts/demo/demo_security_barrier_camera.sh
new file mode 100644 (file)
index 0000000..6f3f60e
--- /dev/null
@@ -0,0 +1,201 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+    echo "Security barrier camera demo that showcases three models coming with the product"
+    echo "-d name     specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device"
+    echo "-help            print help message"
+    exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+    -h | -help | --help)
+    usage
+    ;;
+    -d)
+    target="$2"
+    echo target = "${target}"
+    shift
+    ;;
+    -sample-options)
+    sampleoptions="$2 $3 $4 $5 $6"
+    echo sample-options = "${sampleoptions}"
+    shift
+    ;;
+    *)
+    # unknown option
+    ;;
+esac
+shift
+done
+
+
+target_image_path="$ROOT_DIR/car_1.bmp"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+if [[ -f /etc/centos-release ]]; then
+    DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+    DISTRO="ubuntu"
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    DISTRO="macos"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+    sudo -E yum install -y centos-release-scl epel-release
+    sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+                           glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+    sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+    sudo -E yum install -y epel-release
+    sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+    # check installed Python version
+    if command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    fi
+    if command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+    if [ -z "$python_binary" ]; then
+        sudo -E yum install -y rh-python36 || true
+        . scl_source enable rh-python36
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+    sudo -E apt update
+    print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+    python_binary=python3
+    pip_binary=pip3
+
+    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+    if [ $system_ver = "18.04" ]; then
+        sudo -E apt-get install -y libpng-dev
+    else
+        sudo -E apt-get install -y libpng12-dev
+    fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    # check installed Python version
+    if command -v python3.7 >/dev/null 2>&1; then
+        python_binary=python3.7
+        pip_binary=pip3.7
+    elif command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    elif command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    else
+        python_binary=python3
+        pip_binary=pip3
+    fi
+fi
+
+if ! command -v $python_binary &>/dev/null; then
+    printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+    exit 1
+fi
+
+if [[ $DISTRO == "macos" ]]; then
+    $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+else
+    sudo -E $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+fi
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+    setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+    printf "Error: setupvars.sh is not found\n"
+fi
+if ! . $setupvars_path ; then
+    printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+    exit 1
+fi
+
+# Step 1. Downloading Intel models
+printf "${dashes}"
+printf "Downloading Intel models\n\n"
+
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+downloader_path="$downloader_dir/downloader.py"
+models_path="$HOME/openvino_models/ir"
+models_cache="$HOME/openvino_models/cache"
+
+declare -a model_args
+
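+# For each "<option> <model_name>" pair listed in demo_security_barrier_camera.conf, download the model and collect the matching command-line arguments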
+while read -r model_opt model_name; do
+    model_subdir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+        "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+    model_path="$models_path/$model_subdir/$target_precision/$model_name"
+
+    print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "$models_path" --cache_dir "$models_cache"
+
+    model_args+=("$model_opt" "${model_path}.xml")
+done < "$ROOT_DIR/demo_security_barrier_camera.conf"
+
+# Step 2. Build samples
+printf "${dashes}"
+printf "Build Inference Engine demos\n\n"
+
+demos_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/demos"
+
+if ! command -v cmake &>/dev/null; then
+    printf "\n\nCMAKE is not installed. It is required to build Inference Engine demos. Please install it. ${run_again}"
+    exit 1
+fi
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ $OS_PATH == "x86_64" ]; then
+  OS_PATH="intel64"
+  NUM_THREADS="-j8"
+fi
+
+build_dir="$HOME/inference_engine_demos_build"
+if [ -e $build_dir/CMakeCache.txt ]; then
+       rm -rf $build_dir/CMakeCache.txt
+fi
+mkdir -p $build_dir
+cd $build_dir
+cmake -DCMAKE_BUILD_TYPE=Release $demos_path
+make $NUM_THREADS security_barrier_camera_demo
+
+# Step 3. Run samples
+printf "${dashes}"
+printf "Run Inference Engine security_barrier_camera demo\n\n"
+
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+cd $binaries_dir
+
+print_and_run ./security_barrier_camera_demo -d "$target" -d_va "$target" -d_lpr "$target" -i "$target_image_path" "${model_args[@]}" ${sampleoptions}
+
+printf "${dashes}"
+printf "Demo completed successfully.\n\n"
diff --git a/scripts/demo/demo_squeezenet_download_convert_run.bat b/scripts/demo/demo_squeezenet_download_convert_run.bat
new file mode 100644 (file)
index 0000000..f9dd0e2
--- /dev/null
@@ -0,0 +1,248 @@
+:: Copyright (C) 2018-2019 Intel Corporation
+:: SPDX-License-Identifier: Apache-2.0
+
+@echo off
+setlocal enabledelayedexpansion
+
+set TARGET=CPU
+set BUILD_FOLDER=%USERPROFILE%\Documents\Intel\OpenVINO
+
+:: command line arguments parsing
+:input_arguments_loop
+if not "%1"=="" (
+    if "%1"=="-d" (
+        set TARGET=%2
+        echo target = !TARGET!
+        shift
+    )
+    if "%1"=="-sample-options" (
+        set SAMPLE_OPTIONS=%2 %3 %4 %5 %6
+        echo sample_options = !SAMPLE_OPTIONS!
+        shift
+    )
+    if "%1"=="-help" (
+        echo %~n0%~x0 is a classification demo using the public SqueezeNet topology
+        echo.
+        echo Options:
+        echo -d name     Specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device
+        exit /b
+    )
+    shift
+    goto :input_arguments_loop
+)
+
+set ROOT_DIR=%~dp0
+
+set TARGET_PRECISION=FP16
+
+echo target_precision = !TARGET_PRECISION!
+
+set models_path=%BUILD_FOLDER%\openvino_models\models
+set models_cache=%BUILD_FOLDER%\openvino_models\cache
+set irs_path=%BUILD_FOLDER%\openvino_models\ir
+
+set model_name=squeezenet1.1
+
+set target_image_path=%ROOT_DIR%car.png
+
+if exist "%ROOT_DIR%..\..\bin\setupvars.bat" (
+    call "%ROOT_DIR%..\..\bin\setupvars.bat"
+) else (
+    echo setupvars.bat is not found, INTEL_OPENVINO_DIR can't be set
+    goto errorHandling
+)
+
+echo INTEL_OPENVINO_DIR is set to %INTEL_OPENVINO_DIR%
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+   echo Error^: Python is not installed. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: Check if the Python version is 3.5 or higher
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+   set version=%%F
+)
+echo %version%
+
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+   set Major=%%b
+   set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+   if "%Minor%" geq "5" (
+       set python_ver=okay
+   )
+)
+if not "%python_ver%"=="okay" (
+   echo Unsupported Python version. Please install Python 3.5 ^(64-bit^) or higher from https://www.python.org/downloads/
+   goto errorHandling
+)
+
+:: install yaml python modules required for downloader.py
+pip3 install --user -r "%ROOT_DIR%..\open_model_zoo\tools\downloader\requirements.in"
+if ERRORLEVEL 1 GOTO errorHandling
+
+set downloader_dir=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\downloader
+
+for /F "tokens=* usebackq" %%d in (
+    `python "%downloader_dir%\info_dumper.py" --name "%model_name%" ^|
+        python -c "import sys, json; print(json.load(sys.stdin)[0]['subdirectory'])"`
+) do (
+    set model_dir=%%d
+)
+
+set ir_dir=%irs_path%\%model_dir%\%target_precision%
+
+echo Download public %model_name% model
+echo python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+python "%downloader_dir%\downloader.py" --name %model_name% --output_dir %models_path% --cache_dir %models_cache%
+echo %model_name% model downloading completed
+
+timeout 7
+
+if exist %ir_dir% (
+    echo.
+    echo Target folder %ir_dir% already exists. Skipping IR generation with Model Optimizer.
+    echo If you want to convert a model again, remove the entire %ir_dir% folder.
+    timeout 7
+    GOTO buildSample
+)
+
+echo.
+echo ###############^|^| Install Model Optimizer prerequisites ^|^|###############
+echo.
+timeout 3
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\install_prerequisites"
+call install_prerequisites_caffe.bat
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+echo.
+echo ###############^|^| Run Model Optimizer ^|^|###############
+echo.
+timeout 3
+
+::set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+echo python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+python "%downloader_dir%\converter.py" --mo "%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer\mo.py" --name "%model_name%" -d "%models_path%" -o "%irs_path%" --precisions "%TARGET_PRECISION%"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:buildSample
+echo.
+echo ###############^|^| Generate VS solution for Inference Engine samples using cmake ^|^|###############
+echo.
+timeout 3
+
+if "%PROCESSOR_ARCHITECTURE%" == "AMD64" (
+   set "PLATFORM=x64"
+) else (
+   set "PLATFORM=Win32"
+)
+
+set VSWHERE="false"
+if exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+   set VSWHERE="true"
+   cd "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer"
+) else if exist "%ProgramFiles%\Microsoft Visual Studio\Installer\vswhere.exe" (
+      set VSWHERE="true"
+      cd "%ProgramFiles%\Microsoft Visual Studio\Installer"
+) else (
+   echo "vswhere tool is not found"
+)
+
+set MSBUILD_BIN=
+set VS_PATH=
+
+if !VSWHERE! == "true" (
+   for /f "usebackq tokens=*" %%i in (`vswhere -latest -products * -requires Microsoft.Component.MSBuild -property installationPath`) do (
+      set VS_PATH=%%i
+   )
+   if exist "!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\14.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\15.0\Bin\MSBuild.exe"
+   )
+   if exist "!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=!VS_PATH!\MSBuild\Current\Bin\MSBuild.exe"
+   )
+)
+
+if "!MSBUILD_BIN!" == "" (
+   if exist "C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\MSBuild\14.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=14 2015"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Professional\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+   if exist "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe" (
+      set "MSBUILD_BIN=C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\MSBuild\15.0\Bin\MSBuild.exe"
+      set "MSBUILD_VERSION=15 2017"
+   )
+) else (
+   if not "!MSBUILD_BIN:2019=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=16 2019"
+   if not "!MSBUILD_BIN:2017=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=15 2017"
+   if not "!MSBUILD_BIN:2015=!"=="!MSBUILD_BIN!" set "MSBUILD_VERSION=14 2015"
+)
+
+if "!MSBUILD_BIN!" == "" (
+   echo Build tools for Visual Studio 2015 / 2017 / 2019 cannot be found. If you use Visual Studio 2017, please download and install build tools from https://www.visualstudio.com/downloads/#build-tools-for-visual-studio-2017
+   GOTO errorHandling
+)
+
+set "SOLUTION_DIR64=%BUILD_FOLDER%\inference_engine_samples_build"
+
+echo Creating Visual Studio !MSBUILD_VERSION! %PLATFORM% files in %SOLUTION_DIR64%... && ^
+if exist "%SOLUTION_DIR64%\CMakeCache.txt" del "%SOLUTION_DIR64%\CMakeCache.txt"
+cd "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp" && cmake -E make_directory "%SOLUTION_DIR64%" && cd "%SOLUTION_DIR64%" && cmake -G "Visual Studio !MSBUILD_VERSION!" -A %PLATFORM% "%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\samples\cpp"
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+echo.
+echo ###############^|^| Build Inference Engine samples using MS Visual Studio (MSBuild.exe) ^|^|###############
+echo.
+timeout 3
+echo "!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:classification_sample_async /clp:ErrorsOnly /m
+"!MSBUILD_BIN!" Samples.sln /p:Configuration=Release /t:classification_sample_async /clp:ErrorsOnly /m
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+timeout 7
+
+:runSample
+echo.
+echo ###############^|^| Run Inference Engine classification sample ^|^|###############
+echo.
+timeout 3
+copy /Y "%ROOT_DIR%%model_name%.labels" "%ir_dir%"
+cd "%SOLUTION_DIR64%\intel64\Release"
+
+echo classification_sample_async.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -d !TARGET! !SAMPLE_OPTIONS!
+classification_sample_async.exe -i "%target_image_path%" -m "%ir_dir%\%model_name%.xml" -d !TARGET! !SAMPLE_OPTIONS!
+
+if ERRORLEVEL 1 GOTO errorHandling
+
+echo.
+echo ###############^|^| Classification demo completed successfully ^|^|###############
+
+timeout 10
+cd "%ROOT_DIR%"
+
+goto :eof
+
+:errorHandling
+echo Error
+cd "%ROOT_DIR%"
diff --git a/scripts/demo/demo_squeezenet_download_convert_run.sh b/scripts/demo/demo_squeezenet_download_convert_run.sh
new file mode 100644 (file)
index 0000000..b2acf1a
--- /dev/null
@@ -0,0 +1,221 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+. "$ROOT_DIR/utils.sh"
+
+usage() {
+    echo "Classification demo using public SqueezeNet topology"
+    echo "-d name     specify the target device to infer on; CPU, GPU, FPGA, HDDL or MYRIAD are acceptable. The sample will look for a suitable plugin for the specified device"
+    echo "-help            print help message"
+    exit 1
+}
+
+trap 'error ${LINENO}' ERR
+
+target="CPU"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+
+case $key in
+    -h | -help | --help)
+    usage
+    ;;
+    -d)
+    target="$2"
+    echo target = "${target}"
+    shift
+    ;;
+    -sample-options)
+    sampleoptions="$2 $3 $4 $5 $6"
+    echo sample-options = "${sampleoptions}"
+    shift
+    ;;
+    *)
+    # unknown option
+    ;;
+esac
+shift
+done
+
+target_precision="FP16"
+
+printf "target_precision = ${target_precision}\n"
+
+models_path="$HOME/openvino_models/models"
+models_cache="$HOME/openvino_models/cache"
+irs_path="$HOME/openvino_models/ir"
+
+model_name="squeezenet1.1"
+
+target_image_path="$ROOT_DIR/car.png"
+
+run_again="Then run the script again\n\n"
+dashes="\n\n###################################################\n\n"
+
+
+if [ -e "$ROOT_DIR/../../bin/setupvars.sh" ]; then
+    setupvars_path="$ROOT_DIR/../../bin/setupvars.sh"
+else
+    printf "Error: setupvars.sh is not found\n"
+fi
+
+if ! . $setupvars_path ; then
+    printf "Unable to run ./setupvars.sh. Please check its presence. ${run_again}"
+    exit 1
+fi
+
+# Step 1. Download the Caffe model and the prototxt of the model
+printf "${dashes}"
+printf "\n\nDownloading the Caffe model and the prototxt"
+
+cur_path=$PWD
+
+printf "\nInstalling dependencies\n"
+
+if [[ -f /etc/centos-release ]]; then
+    DISTRO="centos"
+elif [[ -f /etc/lsb-release ]]; then
+    DISTRO="ubuntu"
+fi
+
+if [[ $DISTRO == "centos" ]]; then
+    sudo -E yum install -y centos-release-scl epel-release
+    sudo -E yum install -y gcc gcc-c++ make glibc-static glibc-devel libstdc++-static libstdc++-devel libstdc++ libgcc \
+                           glibc-static.i686 glibc-devel.i686 libstdc++-static.i686 libstdc++.i686 libgcc.i686 cmake
+
+    sudo -E rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-1.el7.nux.noarch.rpm || true
+    sudo -E yum install -y epel-release
+    sudo -E yum install -y cmake ffmpeg gstreamer1 gstreamer1-plugins-base libusbx-devel
+
+    # check installed Python version
+    if command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    fi
+    if command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+    if [ -z "$python_binary" ]; then
+        sudo -E yum install -y rh-python36 || true
+        . scl_source enable rh-python36
+        python_binary=python3.6
+        pip_binary=pip3.6
+    fi
+elif [[ $DISTRO == "ubuntu" ]]; then
+    sudo -E apt update
+    print_and_run sudo -E apt -y install build-essential python3-pip virtualenv cmake libcairo2-dev libpango1.0-dev libglib2.0-dev libgtk2.0-dev libswscale-dev libavcodec-dev libavformat-dev libgstreamer1.0-0 gstreamer1.0-plugins-base
+    python_binary=python3
+    pip_binary=pip3
+
+    system_ver=`cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2`
+    if [ $system_ver = "18.04" ]; then
+        sudo -E apt-get install -y libpng-dev
+    else
+        sudo -E apt-get install -y libpng12-dev
+    fi
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    # check installed Python version
+    if command -v python3.7 >/dev/null 2>&1; then
+        python_binary=python3.7
+        pip_binary=pip3.7
+    elif command -v python3.6 >/dev/null 2>&1; then
+        python_binary=python3.6
+        pip_binary=pip3.6
+    elif command -v python3.5 >/dev/null 2>&1; then
+        python_binary=python3.5
+        pip_binary=pip3.5
+    else
+        python_binary=python3
+        pip_binary=pip3
+    fi
+fi
+
+if ! command -v $python_binary &>/dev/null; then
+    printf "\n\nPython 3.5 (x64) or higher is not installed. It is required to run Model Optimizer, please install it. ${run_again}"
+    exit 1
+fi
+
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+else
+    sudo -E $pip_binary install -r $ROOT_DIR/../open_model_zoo/tools/downloader/requirements.in
+fi
+
+downloader_dir="${INTEL_OPENVINO_DIR}/deployment_tools/open_model_zoo/tools/downloader"
+
+model_dir=$("$python_binary" "$downloader_dir/info_dumper.py" --name "$model_name" |
+    "$python_binary" -c 'import sys, json; print(json.load(sys.stdin)[0]["subdirectory"])')
+
+downloader_path="$downloader_dir/downloader.py"
+
+print_and_run "$python_binary" "$downloader_path" --name "$model_name" --output_dir "${models_path}" --cache_dir "${models_cache}"
+
+ir_dir="${irs_path}/${model_dir}/${target_precision}"
+
+if [ ! -e "$ir_dir" ]; then
+    # Step 2. Configure Model Optimizer
+    printf "${dashes}"
+    printf "Install Model Optimizer dependencies\n\n"
+    cd "${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/install_prerequisites"
+    . ./install_prerequisites.sh caffe
+    cd $cur_path
+
+    # Step 3. Convert a model with Model Optimizer
+    printf "${dashes}"
+    printf "Convert a model with Model Optimizer\n\n"
+
+    mo_path="${INTEL_OPENVINO_DIR}/deployment_tools/model_optimizer/mo.py"
+
+    export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp
+    print_and_run "$python_binary" "$downloader_dir/converter.py" --mo "$mo_path" --name "$model_name" -d "$models_path" -o "$irs_path" --precisions "$target_precision"
+else
+    printf "\n\nTarget folder ${ir_dir} already exists. Skipping IR generation with Model Optimizer."
+    printf "If you want to convert a model again, remove the entire ${ir_dir} folder. ${run_again}"
+fi
+
+# Step 4. Build samples
+printf "${dashes}"
+printf "Build Inference Engine samples\n\n"
+
+OS_PATH=$(uname -m)
+NUM_THREADS="-j2"
+
+if [ $OS_PATH == "x86_64" ]; then
+  OS_PATH="intel64"
+  NUM_THREADS="-j8"
+fi
+
+samples_path="${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/samples/cpp"
+build_dir="$HOME/inference_engine_samples_build"
+binaries_dir="${build_dir}/${OS_PATH}/Release"
+
+if [ -e $build_dir/CMakeCache.txt ]; then
+       rm -rf $build_dir/CMakeCache.txt
+fi
+mkdir -p $build_dir
+cd $build_dir
+cmake -DCMAKE_BUILD_TYPE=Release $samples_path
+
+make $NUM_THREADS classification_sample_async
+
+# Step 5. Run samples
+printf "${dashes}"
+printf "Run Inference Engine classification sample\n\n"
+
+cd $binaries_dir
+
+cp -f $ROOT_DIR/${model_name}.labels ${ir_dir}/
+
+print_and_run ./classification_sample_async -d "$target" -i "$target_image_path" -m "${ir_dir}/${model_name}.xml" ${sampleoptions}
+
+printf "${dashes}"
+
+printf "Demo completed successfully.\n\n"
diff --git a/scripts/demo/squeezenet1.1.labels b/scripts/demo/squeezenet1.1.labels
new file mode 100644 (file)
index 0000000..a509c00
--- /dev/null
@@ -0,0 +1,1000 @@
+tench, Tinca tinca
+goldfish, Carassius auratus
+great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+tiger shark, Galeocerdo cuvieri
+hammerhead, hammerhead shark
+electric ray, crampfish, numbfish, torpedo
+stingray
+cock
+hen
+ostrich, Struthio camelus
+brambling, Fringilla montifringilla
+goldfinch, Carduelis carduelis
+house finch, linnet, Carpodacus mexicanus
+junco, snowbird
+indigo bunting, indigo finch, indigo bird, Passerina cyanea
+robin, American robin, Turdus migratorius
+bulbul
+jay
+magpie
+chickadee
+water ouzel, dipper
+kite
+bald eagle, American eagle, Haliaeetus leucocephalus
+vulture
+great grey owl, great gray owl, Strix nebulosa
+European fire salamander, Salamandra salamandra
+common newt, Triturus vulgaris
+eft
+spotted salamander, Ambystoma maculatum
+axolotl, mud puppy, Ambystoma mexicanum
+bullfrog, Rana catesbeiana
+tree frog, tree-frog
+tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+loggerhead, loggerhead turtle, Caretta caretta
+leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+mud turtle
+terrapin
+box turtle, box tortoise
+banded gecko
+common iguana, iguana, Iguana iguana
+American chameleon, anole, Anolis carolinensis
+whiptail, whiptail lizard
+agama
+frilled lizard, Chlamydosaurus kingi
+alligator lizard
+Gila monster, Heloderma suspectum
+green lizard, Lacerta viridis
+African chameleon, Chamaeleo chamaeleon
+Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+African crocodile, Nile crocodile, Crocodylus niloticus
+American alligator, Alligator mississipiensis
+triceratops
+thunder snake, worm snake, Carphophis amoenus
+ringneck snake, ring-necked snake, ring snake
+hognose snake, puff adder, sand viper
+green snake, grass snake
+king snake, kingsnake
+garter snake, grass snake
+water snake
+vine snake
+night snake, Hypsiglena torquata
+boa constrictor, Constrictor constrictor
+rock python, rock snake, Python sebae
+Indian cobra, Naja naja
+green mamba
+sea snake
+horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+diamondback, diamondback rattlesnake, Crotalus adamanteus
+sidewinder, horned rattlesnake, Crotalus cerastes
+trilobite
+harvestman, daddy longlegs, Phalangium opilio
+scorpion
+black and gold garden spider, Argiope aurantia
+barn spider, Araneus cavaticus
+garden spider, Aranea diademata
+black widow, Latrodectus mactans
+tarantula
+wolf spider, hunting spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse, partridge, Bonasa umbellus
+prairie chicken, prairie grouse, prairie fowl
+peacock
+quail
+partridge
+African grey, African gray, Psittacus erithacus
+macaw
+sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser, Mergus serrator
+goose
+black swan, Cygnus atratus
+tusker
+echidna, spiny anteater, anteater
+platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+wallaby, brush kangaroo
+koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+wombat
+jellyfish
+sea anemone, anemone
+brain coral
+flatworm, platyhelminth
+nematode, nematode worm, roundworm
+conch
+snail
+slug
+sea slug, nudibranch
+chiton, coat-of-mail shell, sea cradle, polyplacophore
+chambered nautilus, pearly nautilus, nautilus
+Dungeness crab, Cancer magister
+rock crab, Cancer irroratus
+fiddler crab
+king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+American lobster, Northern lobster, Maine lobster, Homarus americanus
+spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+crayfish, crawfish, crawdad, crawdaddy
+hermit crab
+isopod
+white stork, Ciconia ciconia
+black stork, Ciconia nigra
+spoonbill
+flamingo
+little blue heron, Egretta caerulea
+American egret, great white heron, Egretta albus
+bittern
+crane
+limpkin, Aramus pictus
+European gallinule, Porphyrio porphyrio
+American coot, marsh hen, mud hen, water hen, Fulica americana
+bustard
+ruddy turnstone, Arenaria interpres
+red-backed sandpiper, dunlin, Erolia alpina
+redshank, Tringa totanus
+dowitcher
+oystercatcher, oyster catcher
+pelican
+king penguin, Aptenodytes patagonica
+albatross, mollymawk
+grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+dugong, Dugong dugon
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog, Maltese terrier, Maltese
+Pekinese, Pekingese, Peke
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound, Afghan
+basset, basset hound
+beagle
+bloodhound, sleuthhound
+bluetick
+black-and-tan coonhound
+Walker hound, Walker foxhound
+English foxhound
+redbone
+borzoi, Russian wolfhound
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound, Ibizan Podenco
+Norwegian elkhound, elkhound
+otterhound, otter hound
+Saluki, gazelle hound
+Scottish deerhound, deerhound
+Weimaraner
+Staffordshire bullterrier, Staffordshire bull terrier
+American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier, Sealyham
+Airedale, Airedale terrier
+cairn, cairn terrier
+Australian terrier
+Dandie Dinmont, Dandie Dinmont terrier
+Boston bull, Boston terrier
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier, Scottish terrier, Scottie
+Tibetan terrier, chrysanthemum dog
+silky terrier, Sydney silky
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa, Lhasa apso
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla, Hungarian pointer
+English setter
+Irish setter, red setter
+Gordon setter
+Brittany spaniel
+clumber, clumber spaniel
+English springer, English springer spaniel
+Welsh springer spaniel
+cocker spaniel, English cocker spaniel, cocker
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog, bobtail
+Shetland sheepdog, Shetland sheep dog, Shetland
+collie
+Border collie
+Bouvier des Flandres, Bouviers des Flandres
+Rottweiler
+German shepherd, German shepherd dog, German police dog, alsatian
+Doberman, Doberman pinscher
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard, St Bernard
+Eskimo dog, husky
+malamute, malemute, Alaskan malamute
+Siberian husky
+dalmatian, coach dog, carriage dog
+affenpinscher, monkey pinscher, monkey dog
+basenji
+pug, pug-dog
+Leonberg
+Newfoundland, Newfoundland dog
+Great Pyrenees
+Samoyed, Samoyede
+Pomeranian
+chow, chow chow
+keeshond
+Brabancon griffon
+Pembroke, Pembroke Welsh corgi
+Cardigan, Cardigan Welsh corgi
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf, grey wolf, gray wolf, Canis lupus
+white wolf, Arctic wolf, Canis lupus tundrarum
+red wolf, maned wolf, Canis rufus, Canis niger
+coyote, prairie wolf, brush wolf, Canis latrans
+dingo, warrigal, warragal, Canis dingo
+dhole, Cuon alpinus
+African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+hyena, hyaena
+red fox, Vulpes vulpes
+kit fox, Vulpes macrotis
+Arctic fox, white fox, Alopex lagopus
+grey fox, gray fox, Urocyon cinereoargenteus
+tabby, tabby cat
+tiger cat
+Persian cat
+Siamese cat, Siamese
+Egyptian cat
+cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+lynx, catamount
+leopard, Panthera pardus
+snow leopard, ounce, Panthera uncia
+jaguar, panther, Panthera onca, Felis onca
+lion, king of beasts, Panthera leo
+tiger, Panthera tigris
+cheetah, chetah, Acinonyx jubatus
+brown bear, bruin, Ursus arctos
+American black bear, black bear, Ursus americanus, Euarctos americanus
+ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+sloth bear, Melursus ursinus, Ursus ursinus
+mongoose
+meerkat, mierkat
+tiger beetle
+ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+ground beetle, carabid beetle
+long-horned beetle, longicorn, longicorn beetle
+leaf beetle, chrysomelid
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant, emmet, pismire
+grasshopper, hopper
+cricket
+walking stick, walkingstick, stick insect
+cockroach, roach
+mantis, mantid
+cicada, cicala
+leafhopper
+lacewing, lacewing fly
+dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+damselfly
+admiral
+ringlet, ringlet butterfly
+monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+cabbage butterfly
+sulphur butterfly, sulfur butterfly
+lycaenid, lycaenid butterfly
+starfish, sea star
+sea urchin
+sea cucumber, holothurian
+wood rabbit, cottontail, cottontail rabbit
+hare
+Angora, Angora rabbit
+hamster
+porcupine, hedgehog
+fox squirrel, eastern fox squirrel, Sciurus niger
+marmot
+beaver
+guinea pig, Cavia cobaya
+sorrel
+zebra
+hog, pig, grunter, squealer, Sus scrofa
+wild boar, boar, Sus scrofa
+warthog
+hippopotamus, hippo, river horse, Hippopotamus amphibius
+ox
+water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+bison
+ram, tup
+bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+ibex, Capra ibex
+hartebeest
+impala, Aepyceros melampus
+gazelle
+Arabian camel, dromedary, Camelus dromedarius
+llama
+weasel
+mink
+polecat, fitch, foulmart, foumart, Mustela putorius
+black-footed ferret, ferret, Mustela nigripes
+otter
+skunk, polecat, wood pussy
+badger
+armadillo
+three-toed sloth, ai, Bradypus tridactylus
+orangutan, orang, orangutang, Pongo pygmaeus
+gorilla, Gorilla gorilla
+chimpanzee, chimp, Pan troglodytes
+gibbon, Hylobates lar
+siamang, Hylobates syndactylus, Symphalangus syndactylus
+guenon, guenon monkey
+patas, hussar monkey, Erythrocebus patas
+baboon
+macaque
+langur
+colobus, colobus monkey
+proboscis monkey, Nasalis larvatus
+marmoset
+capuchin, ringtail, Cebus capucinus
+howler monkey, howler
+titi, titi monkey
+spider monkey, Ateles geoffroyi
+squirrel monkey, Saimiri sciureus
+Madagascar cat, ring-tailed lemur, Lemur catta
+indri, indris, Indri indri, Indri brevicaudatus
+Indian elephant, Elephas maximus
+African elephant, Loxodonta africana
+lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+barracouta, snoek
+eel
+coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+rock beauty, Holocanthus tricolor
+anemone fish
+sturgeon
+gar, garfish, garpike, billfish, Lepisosteus osseus
+lionfish
+puffer, pufferfish, blowfish, globefish
+abacus
+abaya
+academic gown, academic robe, judge's robe
+accordion, piano accordion, squeeze box
+acoustic guitar
+aircraft carrier, carrier, flattop, attack aircraft carrier
+airliner
+airship, dirigible
+altar
+ambulance
+amphibian, amphibious vehicle
+analog clock
+apiary, bee house
+apron
+ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+assault rifle, assault gun
+backpack, back pack, knapsack, packsack, rucksack, haversack
+bakery, bakeshop, bakehouse
+balance beam, beam
+balloon
+ballpoint, ballpoint pen, ballpen, Biro
+Band Aid
+banjo
+bannister, banister, balustrade, balusters, handrail
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel, cask
+barrow, garden cart, lawn cart, wheelbarrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap, swimming cap
+bath towel
+bathtub, bathing tub, bath, tub
+beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+beacon, lighthouse, beacon light, pharos
+beaker
+bearskin, busby, shako
+beer bottle
+beer glass
+bell cote, bell cot
+bib
+bicycle-built-for-two, tandem bicycle, tandem
+bikini, two-piece
+binder, ring-binder
+binoculars, field glasses, opera glasses
+birdhouse
+boathouse
+bobsled, bobsleigh, bob
+bolo tie, bolo, bola tie, bola
+bonnet, poke bonnet
+bookcase
+bookshop, bookstore, bookstall
+bottlecap
+bow
+bow tie, bow-tie, bowtie
+brass, memorial tablet, plaque
+brassiere, bra, bandeau
+breakwater, groin, groyne, mole, bulwark, seawall, jetty
+breastplate, aegis, egis
+broom
+bucket, pail
+buckle
+bulletproof vest
+bullet train, bullet
+butcher shop, meat market
+cab, hack, taxi, taxicab
+caldron, cauldron
+candle, taper, wax light
+cannon
+canoe
+can opener, tin opener
+cardigan
+car mirror
+carousel, carrousel, merry-go-round, roundabout, whirligig
+carpenter's kit, tool kit
+carton
+car wheel
+cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello, violoncello
+cellular telephone, cellular phone, cellphone, cell, mobile phone
+chain
+chainlink fence
+chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+chain saw, chainsaw
+chest
+chiffonier, commode
+chime, bell, gong
+china cabinet, china closet
+Christmas stocking
+church, church building
+cinema, movie theater, movie theatre, movie house, picture palace
+cleaver, meat cleaver, chopper
+cliff dwelling
+cloak
+clog, geta, patten, sabot
+cocktail shaker
+coffee mug
+coffeepot
+coil, spiral, volute, whorl, helix
+combination lock
+computer keyboard, keypad
+confectionery, confectionary, candy store
+container ship, containership, container vessel
+convertible
+corkscrew, bottle screw
+cornet, horn, trumpet, trump
+cowboy boot
+cowboy hat, ten-gallon hat
+cradle
+crane
+crash helmet
+crate
+crib, cot
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam, dike, dyke
+desk
+desktop computer
+dial telephone, dial phone
+diaper, nappy, napkin
+digital clock
+digital watch
+dining table, board
+dishrag, dishcloth
+dishwasher, dish washer, dishwashing machine
+disk brake, disc brake
+dock, dockage, docking facility
+dogsled, dog sled, dog sleigh
+dome
+doormat, welcome mat
+drilling platform, offshore rig
+drum, membranophone, tympan
+drumstick
+dumbbell
+Dutch oven
+electric fan, blower
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa, boa
+file, file cabinet, filing cabinet
+fireboat
+fire engine, fire truck
+fire screen, fireguard
+flagpole, flagstaff
+flute, transverse flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn, horn
+frying pan, frypan, skillet
+fur coat
+garbage truck, dustcart
+gasmask, respirator, gas helmet
+gas pump, gasoline pump, petrol pump, island dispenser
+goblet
+go-kart
+golf ball
+golfcart, golf cart
+gondola
+gong, tam-tam
+gown
+grand piano, grand
+greenhouse, nursery, glasshouse
+grille, radiator grille
+grocery store, grocery, food market, market
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower, blow dryer, blow drier, hair dryer, hair drier
+hand-held computer, hand-held microcomputer
+handkerchief, hankie, hanky, hankey
+hard disc, hard disk, fixed disk
+harmonica, mouth organ, harp, mouth harp
+harp
+harvester, reaper
+hatchet
+holster
+home theater, home theatre
+honeycomb
+hook, claw
+hoopskirt, crinoline
+horizontal bar, high bar
+horse cart, horse-cart
+hourglass
+iPod
+iron, smoothing iron
+jack-o'-lantern
+jean, blue jean, denim
+jeep, landrover
+jersey, T-shirt, tee shirt
+jigsaw puzzle
+jinrikisha, ricksha, rickshaw
+joystick
+kimono
+knee pad
+knot
+lab coat, laboratory coat
+ladle
+lampshade, lamp shade
+laptop, laptop computer
+lawn mower, mower
+lens cap, lens cover
+letter opener, paper knife, paperknife
+library
+lifeboat
+lighter, light, igniter, ignitor
+limousine, limo
+liner, ocean liner
+lipstick, lip rouge
+Loafer
+lotion
+loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+loupe, jeweler's loupe
+lumbermill, sawmill
+magnetic compass
+mailbag, postbag
+mailbox, letter box
+maillot
+maillot, tank suit
+manhole cover
+maraca
+marimba, xylophone
+mask
+matchstick
+maypole
+maze, labyrinth
+measuring cup
+medicine chest, medicine cabinet
+megalith, megalithic structure
+microphone, mike
+microwave, microwave oven
+military uniform
+milk can
+minibus
+miniskirt, mini
+minivan
+missile
+mitten
+mixing bowl
+mobile home, manufactured home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter, scooter
+mountain bike, all-terrain bike, off-roader
+mountain tent
+mouse, computer mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook, notebook computer
+obelisk
+oboe, hautboy, hautbois
+ocarina, sweet potato
+odometer, hodometer, mileometer, milometer
+oil filter
+organ, pipe organ
+oscilloscope, scope, cathode-ray oscilloscope, CRO
+overskirt
+oxcart
+oxygen mask
+packet
+paddle, boat paddle
+paddlewheel, paddle wheel
+padlock
+paintbrush
+pajama, pyjama, pj's, jammies
+palace
+panpipe, pandean pipe, syrinx
+paper towel
+parachute, chute
+parallel bars, bars
+park bench
+parking meter
+passenger car, coach, carriage
+patio, terrace
+pay-phone, pay-station
+pedestal, plinth, footstall
+pencil box, pencil case
+pencil sharpener
+perfume, essence
+Petri dish
+photocopier
+pick, plectrum, plectron
+pickelhaube
+picket fence, paling
+pickup, pickup truck
+pier
+piggy bank, penny bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate, pirate ship
+pitcher, ewer
+plane, carpenter's plane, woodworking plane
+planetarium
+plastic bag
+plate rack
+plow, plough
+plunger, plumber's helper
+Polaroid camera, Polaroid Land camera
+pole
+police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+poncho
+pool table, billiard table, snooker table
+pop bottle, soda bottle
+pot, flowerpot
+potter's wheel
+power drill
+prayer rug, prayer mat
+printer
+prison, prison house
+projectile, missile
+projector
+puck, hockey puck
+punching bag, punch bag, punching ball, punchball
+purse
+quill, quill pen
+quilt, comforter, comfort, puff
+racer, race car, racing car
+racket, racquet
+radiator
+radio, wireless
+radio telescope, radio reflector
+rain barrel
+recreational vehicle, RV, R.V.
+reel
+reflex camera
+refrigerator, icebox
+remote control, remote
+restaurant, eating house, eating place, eatery
+revolver, six-gun, six-shooter
+rifle
+rocking chair, rocker
+rotisserie
+rubber eraser, rubber, pencil eraser
+rugby ball
+rule, ruler
+running shoe
+safe
+safety pin
+saltshaker, salt shaker
+sandal
+sarong
+sax, saxophone
+scabbard
+scale, weighing machine
+school bus
+schooner
+scoreboard
+screen, CRT screen
+screw
+screwdriver
+seat belt, seatbelt
+sewing machine
+shield, buckler
+shoe shop, shoe-shop, shoe store
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule, slipstick
+sliding door
+slot, one-armed bandit
+snorkel
+snowmobile
+snowplow, snowplough
+soap dispenser
+soccer ball
+sock
+solar dish, solar collector, solar furnace
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web, spider's web
+spindle
+sports car, sport car
+spotlight, spot
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch, stop watch
+stove
+strainer
+streetcar, tram, tramcar, trolley, trolley car
+stretcher
+studio couch, day bed
+stupa, tope
+submarine, pigboat, sub, U-boat
+suit, suit of clothes
+sundial
+sunglass
+sunglasses, dark glasses, shades
+sunscreen, sunblock, sun blocker
+suspension bridge
+swab, swob, mop
+sweatshirt
+swimming trunks, bathing trunks
+swing
+switch, electric switch, electrical switch
+syringe
+table lamp
+tank, army tank, armored combat vehicle, armoured combat vehicle
+tape player
+teapot
+teddy, teddy bear
+television, television system
+tennis ball
+thatch, thatched roof
+theater curtain, theatre curtain
+thimble
+thresher, thrasher, threshing machine
+throne
+tile roof
+toaster
+tobacco shop, tobacconist shop, tobacconist
+toilet seat
+torch
+totem pole
+tow truck, tow car, wrecker
+toyshop
+tractor
+trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+tray
+trench coat
+tricycle, trike, velocipede
+trimaran
+tripod
+triumphal arch
+trolleybus, trolley coach, trackless trolley
+trombone
+tub, vat
+turnstile
+typewriter keyboard
+umbrella
+unicycle, monocycle
+upright, upright piano
+vacuum, vacuum cleaner
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin, fiddle
+volleyball
+waffle iron
+wall clock
+wallet, billfold, notecase, pocketbook
+wardrobe, closet, press
+warplane, military plane
+washbasin, handbasin, washbowl, lavabo, wash-hand basin
+washer, automatic washer, washing machine
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool, woolen, woollen
+worm fence, snake fence, snake-rail fence, Virginia fence
+wreck
+yawl
+yurt
+web site, website, internet site, site
+comic book
+crossword puzzle, crossword
+street sign
+traffic light, traffic signal, stoplight
+book jacket, dust cover, dust jacket, dust wrapper
+menu
+plate
+guacamole
+consomme
+hot pot, hotpot
+trifle
+ice cream, icecream
+ice lolly, lolly, lollipop, popsicle
+French loaf
+bagel, beigel
+pretzel
+cheeseburger
+hotdog, hot dog, red hot
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini, courgette
+spaghetti squash
+acorn squash
+butternut squash
+cucumber, cuke
+artichoke, globe artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple, ananas
+banana
+jackfruit, jak, jack
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce, chocolate syrup
+dough
+meat loaf, meatloaf
+pizza, pizza pie
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff, drop, drop-off
+coral reef
+geyser
+lakeside, lakeshore
+promontory, headland, head, foreland
+sandbar, sand bar
+seashore, coast, seacoast, sea-coast
+valley, vale
+volcano
+ballplayer, baseball player
+groom, bridegroom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+corn
+acorn
+hip, rose hip, rosehip
+buckeye, horse chestnut, conker
+coral fungus
+agaric
+gyromitra
+stinkhorn, carrion fungus
+earthstar
+hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+bolete
+ear, spike, capitulum
+toilet tissue, toilet paper, bathroom tissue
diff --git a/scripts/demo/utils.sh b/scripts/demo/utils.sh
new file mode 100644 (file)
index 0000000..f4d0ea9
--- /dev/null
@@ -0,0 +1,16 @@
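+# error LINE [MESSAGE] [CODE]: report the failing line and exit with CODE (default 1).
+# Callers are assumed to wire this up with something like: trap 'error ${LINENO}' ERR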
+error() {
+    local code="${3:-1}"
+    if [[ -n "$2" ]]; then
+        echo "Error on or near line $1: $2; exiting with status ${code}"
+    else
+        echo "Error on or near line $1; exiting with status ${code}"
+    fi
+    exit "${code}"
+}
+
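+# print_and_run: print the exact command with every argument shell-quoted (printf %q), then execute it.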
+print_and_run() {
+    printf 'Run'
+    printf ' %q' "$@"
+    printf '\n\n'
+    "$@"
+}
diff --git a/scripts/install_dependencies/install_4_14_kernel.sh b/scripts/install_dependencies/install_4_14_kernel.sh
new file mode 100644 (file)
index 0000000..cbab52c
--- /dev/null
@@ -0,0 +1,44 @@
+#!/bin/bash -x
+
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script installs Linux kernel 4.14 required for Intel NEO OpenCL driver on Ubuntu and CentOS
+
+if grep -i "rhel" /etc/os-release &>/dev/null; then
+       # Cent OS
+       echo "install kernel build dependencies"
+       sudo -E yum install -y git gcc gcc-c++ ncurses-devel openssl-devel bc xz elfutils-libelf-devel xorg-x11-drv-nouveau rpm-build
+
+       echo "download 4.14.20 kernel"
+       if [ ! -f ./linux-4.14.20.tar.xz ]; then
+               wget https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.20.tar.xz
+       fi
+
+       tar -xJf linux-4.14.20.tar.xz
+       cd linux-4.14.20
+       echo "build 4.14.20 kernel"
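+       # olddefconfig updates the kernel .config, accepting the default value for every option that is not already set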
+       make olddefconfig
+
+       make -j 8 binrpm-pkg
+       cd ~/rpmbuild/RPMS/x86_64
+       sudo -E yum -y localinstall *.rpm
+       sudo -E grub2-set-default 0
+
+elif grep -i "ubuntu" /etc/os-release &>/dev/null; then
+       # Ubuntu
+       sudo -E add-apt-repository ppa:teejee2008/ppa
+       sudo -E apt-get update && sudo apt-get install -y ukuu
+       sudo -E ukuu --install v4.14.20
+fi
\ No newline at end of file
diff --git a/scripts/install_dependencies/install_GST_dependencies.sh b/scripts/install_dependencies/install_GST_dependencies.sh
new file mode 100644 (file)
index 0000000..994bd15
--- /dev/null
@@ -0,0 +1,189 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+if [ $EUID -ne 0 ]; then
+    echo "ERROR: this script must be run as root to install 3rd party packages." >&2
+    echo "Please try again with \"sudo -E $0\", or as root." >&2
+    exit 1
+fi
+
+params=$@
+
+yes_or_no() {
+    if [ "$params" == "-y" ]; then
+        return 0
+    fi
+
+    while true; do
+        read -p "Add third-party repositories and install GStreamer Plugins (y/n): " yn
+        case $yn in
+            [Yy]*) return 0  ;;
+            [Nn]*) return  1 ;;
+        esac
+    done
+}
+
+echo
+echo "This script installs the following GStreamer 3rd-party dependencies:"
+echo "  1. build dependencies for GStreamer plugin bad"
+echo "  2. build dependencies for GStreamer plugin ugly"
+echo "  3. build dependencies for GStreamer plugin vaapi"
+echo
+
+if [ -f /etc/lsb-release ]; then
+    # Ubuntu
+    PKGS=(
+        libbluetooth-dev
+        libusb-1.0.0-dev
+        libass-dev
+        libbs2b-dev
+        libchromaprint-dev
+        liblcms2-dev
+        libssh2-1-dev
+        libdc1394-22-dev
+        libdirectfb-dev
+        libssh-dev
+        libdca-dev
+        libfaac-dev
+        libfaad-dev
+        libfdk-aac-dev
+        flite1-dev
+        libfluidsynth-dev
+        libgme-dev
+        libgsm1-dev
+        nettle-dev
+        libkate-dev
+        liblrdf0-dev
+        libde265-dev
+        libmjpegtools-dev
+        libmms-dev
+        libmodplug-dev
+        libmpcdec-dev
+        libneon27-dev
+        libofa0-dev
+        libopenal-dev
+        libopenexr-dev
+        libopenjp2-7-dev
+        libopenmpt-dev
+        libopenni2-dev
+        libdvdnav-dev
+        librtmp-dev
+        librsvg2-dev
+        libsbc-dev
+        libsndfile1-dev
+        libsoundtouch-dev
+        libspandsp-dev
+        libsrtp2-dev
+        libzvbi-dev
+        libvo-aacenc-dev
+        libvo-amrwbenc-dev
+        libwebrtc-audio-processing-dev
+        libwebp-dev
+        libwildmidi-dev
+        libzbar-dev
+        libnice-dev
+        libx265-dev
+        libxkbcommon-dev
+        libx264-dev
+        libmpeg2-4-dev
+        libdvdread-dev
+        libcdio-dev
+        libopencore-amrnb-dev
+        libopencore-amrwb-dev
+        liba52-0.7.4-dev
+        libsidplay1-dev
+        libva-dev
+        libxrandr-dev
+        libudev-dev
+        python-gi-dev
+        python3-dev
+    )
+    apt update
+    apt install -y ${PKGS[@]}
+else
+    # CentOS
+    PKGS=(
+        bluez-libs-devel
+        libusb-devel
+        libass-devel
+        libbs2b-devel
+        libchromaprint-devel
+        lcms2-devel
+        libssh2-devel
+        libdc1394-devel
+        libXext-devel
+        libssh-devel
+        libdca-devel
+        faac-devel
+        faad2-devel
+        fdk-aac-devel
+        flite-devel
+        fluidsynth-devel
+        game-music-emu-devel
+        gsm-devel
+        nettle-devel
+        kate-devel
+        liblrdf-devel
+        libde265-devel
+        mjpegtools-devel
+        libmms-devel
+        libmodplug-devel
+        libmpcdec-devel
+        neon-devel
+        libofa-devel
+        openal-soft-devel
+        OpenEXR-devel
+        openjpeg2-devel
+        openni-devel
+        libdvdnav-devel
+        librtmp-devel
+        librsvg2-devel
+        sbc-devel
+        libsndfile-devel
+        soundtouch-devel
+        spandsp-devel
+        libsrtp-devel
+        zvbi-devel
+        vo-amrwbenc-devel
+        webrtc-audio-processing-devel
+        wildmidi-devel
+        zbar-devel
+        libnice-devel
+        x265-devel
+        libxkbcommon-devel
+        x264-devel
+        libmpeg2-devel
+        libcdio-devel
+        opencore-amr-devel
+        libva-devel
+        python36-gobject-devel
+        python3-devel
+    )
+    if yes_or_no; then
+        rpm --import http://li.nux.ro/download/nux/RPM-GPG-KEY-nux.ro
+        yum install -y epel-release
+        rpm -Uvh http://li.nux.ro/download/nux/dextop/el7/x86_64/nux-dextop-release-0-5.el7.nux.noarch.rpm
+        yum install -y ${PKGS[@]}
+    else
+        echo
+        echo "Plugins cannot be installed without adding repositories:"
+        echo "     PM-GPG-KEY-nux, epel-release, nux-dextop-release-0-5."
+        echo
+    fi
+    exit
+fi
diff --git a/scripts/install_dependencies/install_NCS_udev_rules.sh b/scripts/install_dependencies/install_NCS_udev_rules.sh
new file mode 100644 (file)
index 0000000..029e12d
--- /dev/null
@@ -0,0 +1,35 @@
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Updating udev rules..."
+
+if [ -z "$INTEL_OPENVINO_DIR" ]; then
+    echo "Please set up your environment. Run 'source <OPENVINO_INSTALLDIR>/bin/setupvars.sh'."
+    exit -1
+fi
+
+if [ -f "$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/97-myriad-usbboot.rules" ]; then
+    sudo usermod -a -G users "$(whoami)"
+
+    sudo cp "$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/97-myriad-usbboot.rules" /etc/udev/rules.d/
+    sudo udevadm control --reload-rules
+    sudo udevadm trigger
+    sudo ldconfig
+    echo "Udev rules have been successfully installed."
+else
+    echo "File '97-myriad-usbboot.rules' is missing. Please make sure you installed 'Inference Engine Runtime for Intel® Movidiusâ„¢ VPU'."
+    exit -1
+fi 
+
+
diff --git a/scripts/install_dependencies/install_NEO_OCL_driver.sh b/scripts/install_dependencies/install_NEO_OCL_driver.sh
new file mode 100644 (file)
index 0000000..c196d27
--- /dev/null
@@ -0,0 +1,311 @@
+#!/bin/bash
+
+# Copyright (c) 2018 - 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Installs the Graphics Driver for OpenCL on Linux.
+#
+# Usage: sudo -E ./install_NEO_OCL_driver.sh
+#
+# Supported platforms:
+#     6th, 7th, 8th or 9th generation Intel® processor with Intel(R)
+#     Processor Graphics Technology not previously disabled by the BIOS
+#     or motherboard settings
+#
+EXIT_FAILURE=1
+UBUNTU_VERSION=
+DISTRO=
+
+
+params=$@
+yes_or_no() {
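+    # Note the inverted convention: returns 1 when the answer is yes (or -y was passed) and 0 when the answer is no.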
+    if [ "$params" == "-y" ]; then
+        return 1
+    fi
+
+    while true; do
+        read -p "Do you want to continue: " yn
+        case $yn in
+            [Yy]*) return 1 ;;
+            [Nn]*) return 0 ;;
+        esac
+    done
+}
+
+
+_install_prerequisites_centos()
+{
+    # yum doesn't accept timeout in seconds as parameter
+    echo
+    echo "Note: if yum becomes non-responsive, try aborting the script and run:"
+    echo "      sudo -E $0"
+    echo
+
+    CMDS=("yum -y install tar libpciaccess numactl-libs"
+          "yum -y groupinstall 'Development Tools'"
+          "yum -y install rpmdevtools openssl openssl-devel bc numactl ocl-icd ocl-icd-devel")
+
+    for cmd in "${CMDS[@]}"; do
+        echo $cmd
+        eval $cmd
+        if [[ $? -ne 0 ]]; then
+            echo ERROR: failed to run $cmd >&2
+            echo "Problem (or disk space)?" >&2
+            echo "Verify that you have enough disk space, and run the script again." >&2
+            exit $EXIT_FAILURE
+        fi
+    done
+
+}
+
+_install_prerequisites_ubuntu()
+{
+    CMDS=("apt-get -y update"
+          "apt-get -y install libnuma1 ocl-icd-libopencl1")
+
+    for cmd in "${CMDS[@]}"; do
+        echo $cmd
+        eval $cmd
+        if [[ $? -ne 0 ]]; then
+            echo ERROR: failed to run $cmd >&2
+            echo "Problem (or disk space)?" >&2
+            echo "1. Try to run the script again:" >&2
+            echo "                sudo -E $0" >&2
+            echo "2. Verify that you have enough disk space, and run the script again." >&2
+            exit $EXIT_FAILURE
+        fi
+    done
+}
+
+install_prerequisites()
+{
+    if [[ $DISTRO == "centos" ]]; then
+        echo Installing prerequisites...
+        _install_prerequisites_centos
+    elif [[ $DISTRO == "ubuntu" ]]; then
+        echo Installing prerequisites...
+        _install_prerequisites_ubuntu
+    else
+        echo Unknown OS
+    fi
+}
+
+_deploy_rpm()
+{
+    # On a CentOS 7.2 machine with Intel Parallel Composer XE 2017
+    # installed we got conflicts when trying to deploy these rpms.
+    # If that happens to you too, try again with:
+    # IGFX_RPM_FLAGS="--force" sudo -E ./install_NEO_OCL_driver.sh install
+    #
+    cmd="rpm $IGFX_RPM_FLAGS -ivh --nodeps --force $1"
+    echo $cmd
+    eval $cmd
+}
+
+_deploy_deb()
+{
+    cmd="dpkg -i $1"
+    echo $cmd
+    eval $cmd
+}
+
+_install_user_mode_centos()
+{
+    _deploy_rpm "intel*.rpm"
+    if [[ $? -ne 0 ]]; then
+        echo "ERROR: failed to install rpm packages ($cmd failed)." >&2
+        echo Make sure you have enough disk space or fix the problem manually and try again. >&2
+        exit $EXIT_FAILURE
+    fi
+}
+
+_install_user_mode_ubuntu()
+{
+    _deploy_deb "intel*.deb"
+    if [[ $? -ne 0 ]]; then
+        echo "ERROR: failed to install deb packages ($cmd failed)." >&2
+        echo Make sure you have enough disk space or fix the problem manually and try again. >&2
+        exit $EXIT_FAILURE
+    fi
+}
+
+install_user_mode()
+{
+    echo Installing user mode driver...
+
+    if [[ $DISTRO == "centos" ]]; then
+        _install_user_mode_centos
+    else
+        _install_user_mode_ubuntu
+    fi
+
+}
+
+_uninstall_user_mode_centos()
+{
+    echo Looking for previously installed user-mode driver...
+    PACKAGES=("intel-opencl"
+           "intel-ocloc"
+           "intel-gmmlib"
+           "intel-igc-core"
+           "intel-igc-opencl")
+    for package in "${PACKAGES[@]}"; do      
+        echo "rpm -qa | grep $package"
+        found_package=$(rpm -qa | grep $package)
+        if [[ $? -eq 0 ]]; then
+            echo Found installed user-mode driver, performing uninstall...
+            cmd="rpm -e --nodeps ${found_package}"
+            echo $cmd
+            eval $cmd
+            if [[ $? -ne 0 ]]; then
+                echo ERROR: failed to uninstall existing user-mode driver. >&2
+                echo Please try again manually and run the script again. >&2
+                exit $EXIT_FAILURE
+            fi
+        fi
+    done
+}
+
+_uninstall_user_mode_ubuntu()
+{
+    echo Looking for previously installed user-mode driver...
+
+    PACKAGES=("intel-opencl"
+           "intel-ocloc"
+           "intel-gmmlib"
+           "intel-igc-core"
+           "intel-igc-opencl")
+
+    for package in "${PACKAGES[@]}"; do
+        found_package=$(dpkg-query -W -f='${binary:Package}\n' ${package})
+        if [[ $? -eq 0 ]]; then
+            echo Found $found_package installed, uninstalling...
+            dpkg --purge $found_package
+            if [[ $? -ne 0 ]]; then
+                echo "ERROR: unable to remove $found_package" >&2
+                echo "       please resolve it manually and try to launch the script again." >&2
+                exit $EXIT_FAILURE
+            fi
+        fi
+    done
+}
+
+uninstall_user_mode()
+{
+    if [[ $DISTRO == "centos" ]]; then
+        _uninstall_user_mode_centos
+    else
+        _uninstall_user_mode_ubuntu
+    fi
+}
+
+version_gt() {
+    # check if first version is greater than second version
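+    # e.g. version_gt "4.14" "4.13.0" succeeds while version_gt "4.13" "4.14.20" fails; sort -V does the version-aware comparison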
+    test "$(printf '%s\n' "$@" | sort -V | head -n 1)" != "$1";
+}
+
+summary()
+{
+    kernel_version=$(uname -r)
+
+    echo
+    echo Installation completed successfully.
+    echo
+    echo Next steps:
+    echo "Add OpenCL users to the video group: 'sudo usermod -a -G video USERNAME'"
+    echo "   e.g. if the user running OpenCL host applications is foo, run: sudo usermod -a -G video foo"
+    echo "   Current user has been already added to the video group"
+    echo
+
+    # ask to install kernel 4.14 if current kernel version < 4.13 (GPU NEO driver supports only kernels 4.13.x and higher)
+    if version_gt "4.13" "$kernel_version" ; then
+        echo "Install 4.14 kernel using install_4_14_kernel.sh script and reboot into this kernel"
+        echo
+    fi
+
+    echo "If you use 8th Generation Intel® Coreâ„¢ processor, you will need to add:"
+    echo "   i915.alpha_support=1"
+    echo "   to the 4.14 kernel command line, in order to enable OpenCL functionality for this platform."
+    echo
+}
+
+check_root_access()
+{
+    if [[ $EUID -ne 0 ]]; then
+        echo "ERROR: you must run this script as root." >&2
+        echo "Please try again with "sudo -E $0", or as root." >&2
+        exit $EXIT_FAILURE
+    fi
+}
+
+add_user_to_video_group()
+{
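+    # Resolve the invoking (non-root) user: prefer logname, then fall back to SUDO_USER, then USER.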
+    local real_user=$(logname 2>/dev/null || echo ${SUDO_USER:-${USER}})
+    echo
+    echo Adding $real_user to the video group...
+    usermod -a -G video $real_user
+    if [[ $? -ne 0 ]]; then
+        echo WARNING: unable to add $real_user to the video group >&2
+    fi
+}
+
+_check_distro_version()
+{
+    if [[ $DISTRO == centos ]]; then
+        if ! grep -q 'CentOS Linux release 7\.' /etc/centos-release; then
+            echo ERROR: this script is supported only on CentOS 7 >&2
+            exit $EXIT_FAILURE
+        fi
+    elif [[ $DISTRO == ubuntu ]]; then
+        grep -q -E "18.04" /etc/lsb-release && UBUNTU_VERSION="18.04"
+        if [[ -z $UBUNTU_VERSION ]]; then
+            echo "Warning: The driver was validated only on Ubuntu 18.04 LTS with stock kernel. \nMore info https://github.com/intel/compute-runtime/releases" >&2
+            if [ ! yes_or_no ]; then
+                echo "Installation of GFX driver interrupted"
+                exit $EXIT_FAILURE
+            fi
+        fi
+    fi
+}
+
+distro_init()
+{
+    if [[ -f /etc/centos-release ]]; then
+        DISTRO="centos"
+    elif [[ -f /etc/lsb-release ]]; then
+        DISTRO="ubuntu"
+    fi
+
+    _check_distro_version
+}
+
+install()
+{
+    uninstall_user_mode
+    install_prerequisites
+    install_user_mode
+    add_user_to_video_group
+}
+
+main()
+{
+    echo "Intel OpenCL graphics driver installer"
+    distro_init
+    check_root_access
+    install
+    summary
+}
+
+[[ "$0" == "$BASH_SOURCE" ]] && main "$@"
diff --git a/scripts/install_dependencies/install_guide.html b/scripts/install_dependencies/install_guide.html
new file mode 100644 (file)
index 0000000..5ddb7c0
--- /dev/null
@@ -0,0 +1 @@
+<meta http-equiv="REFRESH" content="0;URL=http://docs.openvinotoolkit.org/2019_R1/_docs_install_guides_installing_openvino_linux.html#set-the-environment-variables">
\ No newline at end of file
diff --git a/scripts/install_dependencies/install_openvino_dependencies.sh b/scripts/install_dependencies/install_openvino_dependencies.sh
new file mode 100644 (file)
index 0000000..38e855a
--- /dev/null
@@ -0,0 +1,351 @@
+#!/bin/bash
+
+# Copyright (c) 2018 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+if [ $EUID -ne 0 ]; then
+    echo "ERROR: this script must be run as root to install 3rd party packages." >&2
+    echo "Please try again with \"sudo -E $0\", or as root." >&2
+    exit 1
+fi
+
+params=$@
+
+yes_or_no_ffmpeg() {
+    if [ "$params" == "-y" ]; then
+        return 0
+    fi
+
+    while true; do
+        read -p "Add third-party RPM Fusion repository and install FFmpeg package (y/n): " yn
+        case $yn in
+            [Yy]*) return 0  ;;
+            [Nn]*) return  1 ;;
+        esac
+    done
+}
+
+yes_or_no_gst_bad_ugly() {
+    if [ "$params" == "-y" ]; then
+        return 0
+    fi
+
+    while true; do
+        read -p "Add third-party RPM Epel, Nux, Fusion, Forensics repositories and install dependencies for GStreamer Bad & Ugly Plugins (y/n): " yn
+        case $yn in
+            [Yy]*) return 0  ;;
+            [Nn]*) return  1 ;;
+        esac
+    done
+}
+
+if [ -f /etc/lsb-release ]; then
+    # Ubuntu
+    echo
+    echo "This script installs the following OpenVINO 3rd-party dependencies:"
+    echo "  1. GTK+, FFmpeg and GStreamer libraries used by OpenCV"
+    echo "  2. libusb library required for Myriad plugin for Inference Engine"
+    echo "  3. build dependencies for OpenVINO samples"
+    echo "  4. build dependencies for GStreamer Plugins"
+    echo
+    PKGS=(
+        cpio
+        build-essential
+        cmake
+        libusb-1.0-0-dev
+        libdrm-dev
+        libgstreamer1.0-0
+        gstreamer1.0-plugins-base
+        gstreamer1.0-plugins-good
+        gstreamer1.0-plugins-bad
+        ffmpeg
+    )
+    system_ver=$(cat /etc/lsb-release | grep -i "DISTRIB_RELEASE" | cut -d "=" -f2)
+    if [ "$system_ver" = "16.04" ]; then
+        PKGS+=( libgtk2.0-0 )
+    else
+        PKGS+=( libgtk-3-0
+                libglib2.0-0
+                flex
+                bison
+                libgmp10
+                libgsl23
+                gobject-introspection
+                libcap2
+                libcap2-bin
+                gettext
+                libgirepository-1.0-1
+                libx11-6
+                iso-codes
+                libgl1-mesa-dri
+                libgles2
+                libgl-dev
+                gudev-1.0
+                libtheora0
+                libcdparanoia0
+                libpango-1.0-0
+                libgbm1
+                libasound2
+                libjpeg8
+                libvisual-0.4-0
+                libxv1
+                libopus0
+                libgraphene-1.0-0
+                libvorbis0a
+                libbz2-1.0
+                libv4l-0
+                libaa1
+                libflac8
+                libgdk-pixbuf2.0-0
+                libmp3lame0
+                libcaca0
+                libdv4
+                libmpg123-0
+                libraw1394-11
+                libavc1394-0
+                libiec61883-0
+                libpulse0
+                libsoup2.4-1
+                libspeex1
+                libtag-extras1
+                libtwolame0
+                libwavpack1
+                libbluetooth3
+                libusb-1.0.0-dev
+                libass9
+                libbs2b0
+                libchromaprint1
+                liblcms2-2
+                libssh2-1
+                libdc1394-22
+                libdirectfb-1.7-7
+                libssh-4
+                libdca0
+                libfaac0
+                libfdk-aac1
+                libflite1
+                libfluidsynth1
+                libgme0
+                libgsm1
+                libnettle6
+                libkate1
+                liblrdf0
+                libde265-0
+                libmjpegtools-dev
+                libmms0
+                libmodplug1
+                libmpcdec6
+                libneon27
+                libopenal1
+                libopenexr22
+                libopenjp2-7
+                libopenmpt0
+                libopenni2-0
+                libdvdnav4
+                librtmp1
+                librsvg2-2
+                libsbc1
+                libsndfile1
+                libsoundtouch1
+                libspandsp2
+                libsrtp2-1
+                libzvbi0
+                libvo-aacenc0
+                libvo-amrwbenc0
+                libwebrtc-audio-processing1
+                libwebp6
+                libwildmidi2
+                libzbar0
+                libnice10
+                libxkbcommon0
+                libmpeg2-4
+                libopencore-amrnb0
+                libopencore-amrwb0
+                liba52-0.7.4
+                libva2
+                libxrandr2
+                libudev1
+                python3.6
+                libpython3.6
+                python3-gi
+        )
+    fi
+    apt update
+    apt install -y ${PKGS[@]}
+else
+    # CentOS
+    echo
+    echo "This script installs the following OpenVINO 3rd-party dependencies:"
+    echo "  1. GTK+ and GStreamer libraries used by OpenCV"
+    echo "  2. libusb library required for Myriad plugin for Inference Engine"
+    echo "  3. Python 3.6 for Model Optimizer"
+    echo "  4. gcc 4.8.5 and other build dependencies for OpenVINO samples"
+    echo "  5. build dependencies for GStreamer Plugins"
+    echo
+    PKGS=(
+        libusbx-devel
+        gtk2
+        gstreamer1
+        gstreamer1-plugins-good
+        gstreamer1-plugins-bad-free
+        gcc
+        gcc-c++
+        make
+        glibc-static
+        glibc
+        libstdc++-static
+        libstdc++
+        libstdc++
+        libgcc
+        cmake
+        python36
+        python36-pip
+        glib2-devel
+        flex
+        bison
+        gmp
+        gsl
+        gobject-introspection
+        libcap
+        libcap
+        gettext
+        libXrandr
+        libX11
+        iso-codes
+        mesa-libEGL
+        mesa-libGLES
+        mesa-libGL
+        libgudev1
+        libtheora
+        cdparanoia
+        pango
+        mesa-libgbm
+        alsa-lib
+        libjpeg-turbo
+        libvisual
+        libXv
+        opus
+        libvorbis
+        patch
+        bzip2
+        libv4l
+        flac
+        gdk-pixbuf2
+        libdv
+        mpg123
+        libraw1394
+        libavc1394
+        libiec61883
+        pulseaudio-libs
+        libsoup
+        speex
+        wavpack
+        boost-regex-1.53.0
+    )
+    yum install -y ${PKGS[@]}
+
+    # Thirdparty repositories for installing GStreamer Bad & Ugly Plugins dependencies.
+    if yes_or_no_gst_bad_ugly; then
+        GST_BAD_UGLY_PKGS=(
+            bluez-libs
+            libusb
+            libass
+            libbs2b
+            libchromaprint
+            lcms2
+            libssh2
+            libdc1394
+            libXext
+            libssh
+            libdca
+            faac
+            fdk-aac
+            flite
+            fluidsynth
+            game-music-emu
+            gsm
+            nettle
+            kate
+            liblrdf
+            libde265
+            mjpegtools
+            libmms
+            libmodplug
+            libmpcdec
+            neon
+            openal-soft
+            OpenEXR
+            openjpeg2
+            openni
+            libdvdnav
+            librtmp
+            librsvg2
+            sbc
+            libsndfile
+            soundtouch
+            spandsp
+            libsrtp
+            zvbi
+            vo-amrwbenc
+            webrtc-audio-processing
+            wildmidi
+            zbar
+            libnice
+            libxkbcommon
+            libmpeg2
+            libcdio
+            opencore-amr
+            libva
+            python36-gobject
+            python3-devel
+        )
+        yum install -y epel-release
+        rpm -Uvh https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm
+        RPMFUSION_IS_INSTALLED=1
+        yum install -y https://forensics.cert.org/cert-forensics-tools-release-el7.rpm
+        yum install -y ${GST_BAD_UGLY_PKGS[@]}
+    else
+        echo "Dependencies for GStreamer Ugly & Bad plugins installation skipped."
+        echo
+    fi
+
+    echo
+    echo "Intel(R) Distribution of OpenVINO(TM) toolkit can use FFmpeg for processing video streams with OpenCV. Please select your preferred method for installing FFmpeg:"
+    echo
+    echo "Option 1: Allow installer script to add a third party repository, RPM Fusion (https://rpmfusion.org/), which contains FFmpeg. FFmpeg rpm package will be installed from this repository. "
+    echo "WARNING: This repository is NOT PROVIDED OR SUPPORTED by Intel or CentOS. Neither Intel nor CentOS has control over this repository. Terms governing your use of FFmpeg can be found here: https://www.ffmpeg.org/legal.html "
+    echo "Once added, this repository will be enabled on your operating system and can thus receive updates to all packages installed from it. "
+    echo
+    echo "Consider the following ways to prevent unintended 'updates' from this third party repository from over-writing some core part of CentOS:"
+    echo "a) Only enable these archives from time to time, and generally leave them disabled. See: man yum"
+    echo "b) Use the exclude= and includepkgs= options on a per sub-archive basis, in the matching .conf file found in /etc/yum.repos.d/ See: man yum.conf"
+    echo "c) The yum Priorities plug-in can prevent a 3rd party repository from replacing base packages, or prevent base/updates from replacing a 3rd party package."
+    echo
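+    # Illustrative (hypothetical) /etc/yum.repos.d stanza for option b, keeping the repo disabled except for specific packages:
+    #   [rpmfusion-free-updates]
+    #   enabled=0
+    #   includepkgs=ffmpeg*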
+    echo "Option 2: Skip FFmpeg installation."
+    echo
+
+    if yes_or_no_ffmpeg; then
+        if [[ -z $RPMFUSION_IS_INSTALLED ]]; then
+            yum install -y epel-release
+            rpm -Uvh https://download1.rpmfusion.org/free/el/rpmfusion-free-release-7.noarch.rpm
+        fi
+        yum install -y ffmpeg
+    else
+        echo "FFmpeg installation skipped. You may build FFmpeg from sources as described here: https://trac.ffmpeg.org/wiki/CompilationGuide/Centos"
+        echo
+    fi
+    exit
+fi
diff --git a/scripts/setupvars/setupvars.bat b/scripts/setupvars/setupvars.bat
new file mode 100644 (file)
index 0000000..1f08c14
--- /dev/null
@@ -0,0 +1,111 @@
+@echo off
+
+:: Copyright (c) 2018-2019 Intel Corporation
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::      http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+
+set ROOT=%~dp0
+call :GetFullPath "%ROOT%\.." ROOT
+set SCRIPT_NAME=%~nx0
+
+set "INTEL_OPENVINO_DIR=%ROOT%"
+set "INTEL_CVSDK_DIR=%INTEL_OPENVINO_DIR%"
+
+where /q libmmd.dll || echo Warning: libmmd.dll couldn't be found in %%PATH%%. Please check if the redistributable package for Intel(R) C++ Compiler is installed and the library path is added to the PATH environment variable. System reboot can be required to update the system environment.
+
+:: OpenCV
+if exist "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat" (
+call "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat"
+) else (
+set "OpenCV_DIR=%INTEL_OPENVINO_DIR%\opencv\x64\vc14\lib"
+set "PATH=%INTEL_OPENVINO_DIR%\opencv\x64\vc14\bin;%PATH%"
+)
+
+:: Model Optimizer
+if exist %INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer (
+set PYTHONPATH=%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer;%PYTHONPATH%
+set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\model_optimizer;%PATH%"
+)
+
+:: Inference Engine
+set "InferenceEngine_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\share"
+set "HDDL_INSTALL_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\hddl"
+set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\external\tbb\bin;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Release;%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\Debug;%HDDL_INSTALL_DIR%\bin;%PATH%"
+if exist %INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\arch_descriptions (
+set ARCH_ROOT_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\inference_engine\bin\intel64\arch_descriptions
+)
+
+:: nGraph
+if exist %INTEL_OPENVINO_DIR%\deployment_tools\ngraph (
+set "PATH=%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\lib;%PATH%"
+set "ngraph_DIR=%INTEL_OPENVINO_DIR%\deployment_tools\ngraph\cmake"
+)
+
+:: Check if Python is installed
+python --version 2>NUL
+if errorlevel 1 (
+   echo Error^: Python is not installed. Please install Python 3.5 or 3.6 ^(64-bit^) from https://www.python.org/downloads/
+   exit /B 1
+)
+
+:: Check Python version
+for /F "tokens=* USEBACKQ" %%F IN (`python --version 2^>^&1`) DO (
+   set version=%%F
+)
+
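+:: "python --version" prints e.g. "Python 3.6.8"; splitting on "." and space leaves the major version in %%b and the minor in %%c.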
+for /F "tokens=1,2,3 delims=. " %%a in ("%version%") do (
+   set Major=%%b
+   set Minor=%%c
+)
+
+if "%Major%" geq "3" (
+   if "%Minor%" geq "5" (
+      set python_ver=okay
+   )
+)
+
+if not "%python_ver%"=="okay" (
+   echo Unsupported Python version. Please install Python 3.5 or 3.6 ^(64-bit^) from https://www.python.org/downloads/
+   exit /B 1
+)
+
+:: Check Python bitness
+python -c "import sys; print(64 if sys.maxsize > 2**32 else 32)" 2 > NUL
+if errorlevel 1 (
+   echo Error^: Error during installed Python bitness detection
+   exit /B 1
+)
+
+for /F "tokens=* USEBACKQ" %%F IN (`python -c "import sys; print(64 if sys.maxsize > 2**32 else 32)" 2^>^&1`) DO (
+   set bitness=%%F
+)
+
+if not "%bitness%"=="64" (
+   echo Unsupported Python bitness. Please install Python 3.5 or 3.6 ^(64-bit^) from https://www.python.org/downloads/
+   exit /B 1
+)
+
+set PYTHONPATH=%INTEL_OPENVINO_DIR%\python\python%Major%.%Minor%;%INTEL_OPENVINO_DIR%\python\python3;%PYTHONPATH%
+
+if exist %INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\accuracy_checker (
+    set PYTHONPATH=%INTEL_OPENVINO_DIR%\deployment_tools\open_model_zoo\tools\accuracy_checker;%PYTHONPATH%
+)
+
+echo [setupvars.bat] OpenVINO environment initialized
+
+exit /B 0
+
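+:: GetFullPath: resolve the path in %1 to an absolute path and store it in the variable named by %2.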
+:GetFullPath
+SET %2=%~f1
+
+GOTO :EOF
diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh
new file mode 100644 (file)
index 0000000..3ce0d70
--- /dev/null
@@ -0,0 +1,132 @@
+#!/bin/bash
+
+# Copyright (c) 2018-2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INSTALLDIR="${INTEL_OPENVINO_DIR:-<INSTALLDIR>}"
+if [[ ! -d "${INSTALLDIR}" ]]; then
+  # Script has not been processed by installer, so INSTALLDIR is not valid
+  # Using autodetection assuming:
+  # - current shell is "bash"
+  # - location of the current script is in "INSTALLDIR/bin"
+  SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+  BASE_DIR="$( dirname "$SCRIPT_DIR" )"
+
+  INSTALLDIR="${BASE_DIR}"
+fi
+
+export INTEL_OPENVINO_DIR="$INSTALLDIR"
+export INTEL_CVSDK_DIR="$INTEL_OPENVINO_DIR"
+
+# parse command line options
+while [[ $# -gt 0 ]]
+do
+key="$1"
+case $key in
+    -pyver)
+    python_version=$2
+    echo python_version = "${python_version}"
+    shift
+    ;;
+    *)
+    # unknown option
+    ;;
+esac
+shift
+done
+
+if [ -e $INSTALLDIR/deployment_tools/inference_engine ]; then
+    export InferenceEngine_DIR=$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/share
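+    # \ls bypasses any shell alias; the lib/ directory is expected to contain a single architecture subfolder (e.g. intel64) used as the plugin path.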
+    system_type=$(\ls $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/)
+    IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/$system_type
+
+    if [[ -e ${IE_PLUGINS_PATH}/arch_descriptions ]]; then
+        export ARCH_ROOT_DIR=${IE_PLUGINS_PATH}/arch_descriptions
+    fi
+
+    export HDDL_INSTALL_DIR=$INSTALLDIR/deployment_tools/inference_engine/external/hddl
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        export DYLD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_mac/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$DYLD_LIBRARY_PATH
+        export LD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_mac/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$LD_LIBRARY_PATH
+    else
+        export LD_LIBRARY_PATH=$HDDL_INSTALL_DIR/lib:$INSTALLDIR/deployment_tools/inference_engine/external/gna/lib:$INSTALLDIR/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INSTALLDIR/deployment_tools/inference_engine/external/tbb/lib:$IE_PLUGINS_PATH:$LD_LIBRARY_PATH
+    fi
+fi
+
+if [ -e $INSTALLDIR/deployment_tools/ngraph ]; then
+    export LD_LIBRARY_PATH=$INSTALLDIR/deployment_tools/ngraph/lib:$LD_LIBRARY_PATH
+    export ngraph_DIR=$INSTALLDIR/deployment_tools/ngraph/cmake
+fi
+
+if [ -e "$INSTALLDIR/opencv" ]; then
+    if [ -f "$INSTALLDIR/opencv/setupvars.sh" ]; then
+        source "$INSTALLDIR/opencv/setupvars.sh"
+    else
+        export OpenCV_DIR="$INSTALLDIR/opencv/share/OpenCV"
+        export LD_LIBRARY_PATH="$INSTALLDIR/opencv/lib:$LD_LIBRARY_PATH"
+        export LD_LIBRARY_PATH="$INSTALLDIR/opencv/share/OpenCV/3rdparty/lib:$LD_LIBRARY_PATH"
+    fi
+fi
+
+
+if [ -f "$INTEL_OPENVINO_DIR/data_processing/dl_streamer/bin/setupvars.sh" ]; then
+    source "$INTEL_OPENVINO_DIR/data_processing/dl_streamer/bin/setupvars.sh"
+fi
+
+export PATH="$INTEL_OPENVINO_DIR/deployment_tools/model_optimizer:$PATH"
+export PYTHONPATH="$INTEL_OPENVINO_DIR/deployment_tools/model_optimizer:$PYTHONPATH"
+
+if [ -e $INTEL_OPENVINO_DIR/deployment_tools/open_model_zoo/tools/accuracy_checker ]; then
+    export PYTHONPATH="$INTEL_OPENVINO_DIR/deployment_tools/open_model_zoo/tools/accuracy_checker:$PYTHONPATH"
+fi
+
+if [ -z "$python_version" ]; then
+    if command -v python3.7 >/dev/null 2>&1; then
+        python_version=3.7
+        python_bitness=$(python3.7 -c 'import sys; print(64 if sys.maxsize > 2**32 else 32)')
+    elif command -v python3.6 >/dev/null 2>&1; then
+        python_version=3.6
+        python_bitness=$(python3.6 -c 'import sys; print(64 if sys.maxsize > 2**32 else 32)')
+    elif command -v python3.5 >/dev/null 2>&1; then
+        python_version=3.5
+        python_bitness=$(python3.5 -c 'import sys; print(64 if sys.maxsize > 2**32 else 32)')
+    elif command -v python3.4 >/dev/null 2>&1; then
+        python_version=3.4
+        python_bitness=$(python3.4 -c 'import sys; print(64 if sys.maxsize > 2**32 else 32)')
+    elif command -v python2.7 >/dev/null 2>&1; then
+        python_version=2.7
+    elif command -v python >/dev/null 2>&1; then
+        python_version=$(python -c 'import sys; print(".".join(map(str, sys.version_info[:2])))')
+    fi
+fi
+
+OS_NAME=""
+if command -v lsb_release >/dev/null 2>&1; then
+    OS_NAME=$(lsb_release -i -s)
+fi
+
+if [ "$python_bitness" != "" ] && [ "$python_bitness" != "64" ] && [ "$OS_NAME" != "Raspbian" ]; then
+    echo "[setupvars.sh] 64 bitness for Python" $python_version "is requred"
+fi
+
+if [ ! -z "$python_version" ]; then
+    if [ "$python_version" != "2.7" ]; then
+        # add path to OpenCV API for Python 3.x
+        export PYTHONPATH="$INTEL_OPENVINO_DIR/python/python3:$PYTHONPATH"
+    fi
+    # add path to Inference Engine Python API
+    export PYTHONPATH="$INTEL_OPENVINO_DIR/python/python$python_version:$PYTHONPATH"
+fi
+
+echo "[setupvars.sh] OpenVINO environment initialized"
diff --git a/scripts/utils/create_package.py b/scripts/utils/create_package.py
new file mode 100644 (file)
index 0000000..6eb53ad
--- /dev/null
@@ -0,0 +1,15 @@
+import argparse
+import os
+from shutil import rmtree
+
+from utils import Automation
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--build_number", type=int, help="Build number to be added to package version", default=0, )
+args = parser.parse_args()
+
+auto = Automation()
+base_dir = os.path.dirname(__file__)
+bom_path = os.path.join(base_dir, "package_BOM.txt")
+bom = auto.parse_bom(bom_path=bom_path)
+dir_to_tar = auto.copy_files_from_bom(root_path=os.path.join(os.path.dirname(__file__), ".."), bom=bom)
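create_package.py stops after staging the BOM files into a directory; the remaining Automation helpers defined in scripts/utils/utils.py (add_version_txt, make_tarfile) are not called here. A hypothetical continuation, shown only to illustrate how those helpers fit together; the archive name format and the "0" short-hash placeholder are assumptions, not values taken from this snapshot:

    # Hypothetical continuation (not part of the diff above).
    # Passing git_hash_short="0" makes add_version_txt look up the short hash itself.
    version = auto.add_version_txt(dst_path=dir_to_tar,
                                   build_number=args.build_number,
                                   git_hash_short="0")
    auto.make_tarfile(out_file_name="tools_package_{}.tar.gz".format(version),
                      source_dir=dir_to_tar)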
diff --git a/scripts/utils/utils.py b/scripts/utils/utils.py
new file mode 100644 (file)
index 0000000..7d33e9e
--- /dev/null
@@ -0,0 +1,56 @@
+import os
+import subprocess
+import tarfile
+from datetime import datetime
+from shutil import copyfile, copytree, rmtree
+
+major_version = 0
+minor_version = 3
+
+
+class Automation:
+    @staticmethod
+    def parse_bom(bom_path):
+        # Return the BOM entries; each keeps its trailing newline, which
+        # copy_files_from_bom() strips before building paths.
+        with open(bom_path) as bom_file:
+            return [line for line in bom_file]
+
+    @staticmethod
+    def copy_files_from_bom(root_path, bom):
+        target_dir = os.path.join(os.path.dirname(__file__), "tools_package")
+        if os.path.exists(target_dir):
+            rmtree(target_dir)
+        os.makedirs(target_dir)
+        for file in bom:
+            src = os.path.join(root_path, file.strip('\n'))
+            dst = os.path.join(target_dir, file.strip('\n'))
+            if not os.path.exists(os.path.dirname(dst)):
+                os.makedirs(os.path.dirname(dst))
+            if os.path.isdir(src):
+                copytree(src, dst)
+            else:
+                copyfile(src, dst)
+        return target_dir
+
+    @staticmethod
+    def add_version_txt(dst_path, build_number, git_hash_short):
+        git_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip("\n")
+        if git_hash_short == "0":
+            git_hash_short = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("utf-8").strip(
+                "\n")
+        verson = "{0}.{1}.{2}.{3}".format(major_version, minor_version, build_number, git_hash_short)
+        timestamp = datetime.now().strftime("%I:%M%p %B %d, %Y")
+        with open(os.path.join(dst_path, "version.txt"), 'w') as f:
+            f.write(timestamp + '\n')
+            f.write(verson + '\n')
+            f.write(git_hash + '\n')
+        return verson
+
+    @staticmethod
+    def make_tarfile(out_file_name, source_dir):
+        # Build the archive next to this script, replacing any previous copy.
+        archive_path = os.path.join(os.path.dirname(__file__), out_file_name)
+        if os.path.exists(archive_path):
+            os.remove(archive_path)
+        with tarfile.open(archive_path, "w:gz") as tar:
+            tar.add(source_dir, arcname=os.path.basename(source_dir))
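For clarity, the version string written by Automation.add_version_txt follows a {major}.{minor}.{build}.{short_hash} layout; the build number and hash below are made-up example values:

    major_version, minor_version = 0, 3           # module-level constants from utils.py
    build_number, git_hash_short = 42, "abc1234"  # example values, not taken from the repo
    print("{0}.{1}.{2}.{3}".format(major_version, minor_version, build_number, git_hash_short))
    # -> 0.3.42.abc1234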
diff --git a/tests/stress_tests/.automation/env_config.xml b/tests/stress_tests/.automation/env_config.xml
new file mode 100644 (file)
index 0000000..7d356d0
--- /dev/null
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<attributes>
+    <irs_path>
+        <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+    </irs_path>
+</attributes>
diff --git a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml
new file mode 100644 (file)
index 0000000..82a6c6c
--- /dev/null
@@ -0,0 +1,533 @@
+<?xml version="1.0"?>
+<attributes>
+    <models>
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753847" vmpeak="1528832" vmrss="14005" vmhwm="814655" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="GPU" vmsize="580025" vmpeak="1743759" vmrss="234704" vmhwm="1462062" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1339971" vmpeak="1528828" vmrss="555262" vmhwm="814805" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1389159" vmpeak="1741154" vmrss="1036169" vmhwm="1460052" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753843" vmpeak="1545451" vmrss="14234" vmhwm="821334" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="602206" vmpeak="1511325" vmrss="257501" vmhwm="1230284" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1368206" vmpeak="1545456" vmrss="576774" vmhwm="821739" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1423096" vmpeak="1511373" vmrss="1074752" vmhwm="1230732" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772626" vmpeak="985754" vmrss="95260" vmhwm="151496" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1044604" vmpeak="1154709" vmrss="699168" vmhwm="811104" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985525" vmpeak="1057614" vmrss="159306" vmhwm="159306" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1163289" vmpeak="1235379" vmrss="812961" vmhwm="812961" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762770" vmpeak="1212248" vmrss="93570" vmhwm="426817" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1127847" vmpeak="1586310" vmrss="782029" vmhwm="1304679" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1351816" vmpeak="1423906" vmrss="353738" vmhwm="427644" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1660304" vmpeak="1660304" vmrss="1309215" vmhwm="1309215" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="791863" vmpeak="998329" vmrss="123059" vmhwm="240160" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1309598" vmpeak="1428944" vmrss="964066" vmhwm="1086751" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060303" vmpeak="1132392" vmrss="238924" vmhwm="240416" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1435214" vmpeak="1507303" vmrss="1084969" vmhwm="1084969" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864639" vmpeak="1153900" vmrss="147906" vmhwm="322590" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1541161" vmpeak="1686282" vmrss="1195972" vmhwm="1337595" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181479" vmpeak="1253568" vmrss="315581" vmhwm="322700" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1706760" vmpeak="1778849" vmrss="1356533" vmhwm="1356533" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="CPU" vmsize="754428" vmpeak="3004311" vmrss="17613" vmhwm="1856210" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="GPU" vmsize="710569" vmpeak="3363879" vmrss="365380" vmhwm="3081751" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="CPU" vmsize="2487130" vmpeak="3004311" vmrss="1687936" vmhwm="1856448" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="GPU" vmsize="2951748" vmpeak="3363804" vmrss="2597940" vmhwm="3080968" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767157" vmpeak="1369376" vmrss="63338" vmhwm="540166" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1155101" vmpeak="1701180" vmrss="809938" vmhwm="1420152" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1299262" vmpeak="1373882" vmrss="431758" vmhwm="540214" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1647738" vmpeak="1719828" vmrss="1296350" vmhwm="1419092" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1642832" vmrss="14014" vmhwm="789109" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="595430" vmpeak="1690484" vmrss="250496" vmhwm="1409205" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1494464" vmpeak="1642832" vmrss="679214" vmhwm="789412" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1450746" vmpeak="1693172" vmrss="1097681" vmhwm="1412254" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="919740" vmpeak="1521955" vmrss="234520" vmhwm="792022" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1666363" vmpeak="2175012" vmrss="1321245" vmhwm="1893936" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1436982" vmpeak="1521955" vmrss="643614" vmhwm="793218" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2138818" vmpeak="2210907" vmrss="1786162" vmhwm="1893760" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757262" vmpeak="978832" vmrss="81408" vmhwm="124238" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="810590" vmpeak="929139" vmrss="464868" vmhwm="503813" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="928637" vmpeak="1000727" vmrss="130719" vmhwm="130719" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="859478" vmpeak="931568" vmrss="507540" vmhwm="507540" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="766726" vmpeak="925245" vmrss="33382" vmhwm="180268" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="775117" vmpeak="913347" vmrss="430157" vmhwm="605598" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927163" vmpeak="999253" vmrss="141869" vmhwm="181156" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="924752" vmpeak="996842" vmrss="571590" vmhwm="602839" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="767003" vmpeak="1090526" vmrss="34900" vmhwm="348172" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="948046" vmpeak="1182082" vmrss="602624" vmhwm="900169" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1051481" vmpeak="1123570" vmrss="257219" vmhwm="348541" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1187106" vmpeak="1259196" vmrss="834438" vmhwm="902800" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764315" vmpeak="1326938" vmrss="63725" vmhwm="603213" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1183410" vmpeak="1680448" vmrss="837953" vmhwm="1398870" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1227798" vmpeak="1326908" vmrss="438160" vmhwm="602434" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1633997" vmpeak="1706086" vmrss="1281693" vmhwm="1395878" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="CPU" vmsize="753605" vmpeak="876330" vmrss="15571" vmhwm="29106" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="GPU" vmsize="566693" vmpeak="658486" vmrss="220783" vmhwm="232452" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="CPU" vmsize="808486" vmpeak="880576" vmrss="29084" vmhwm="29084" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="GPU" vmsize="586401" vmpeak="658490" vmrss="232764" vmhwm="232764" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754864" vmpeak="893692" vmrss="54617" vmhwm="81584" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="642527" vmpeak="750424" vmrss="296678" vmhwm="362300" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831336" vmpeak="903425" vmrss="85654" vmhwm="85654" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="716047" vmpeak="788136" vmrss="364434" vmhwm="364434" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756813" vmpeak="819698" vmrss="54410" vmhwm="78289" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758705" vmpeak="862466" vmrss="412966" vmhwm="437131" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="840967" vmpeak="840967" vmrss="82860" vmhwm="82860" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="787182" vmpeak="859271" vmrss="436801" vmhwm="436801" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="753715" vmpeak="876299" vmrss="17512" vmhwm="28402" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="583092" vmpeak="674744" vmrss="238220" vmhwm="249722" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="27865" vmhwm="27865" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600714" vmpeak="672804" vmrss="246967" vmhwm="246967" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="763677" vmpeak="874535" vmrss="13318" vmhwm="35327" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570521" vmpeak="662182" vmrss="224774" vmhwm="351410" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="973350" vmrss="108037" vmhwm="108037" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="685115" vmpeak="757204" vmrss="331421" vmhwm="351529" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14806" vmhwm="25911" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577280" vmpeak="667673" vmrss="232029" vmhwm="242580" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="25352" vmhwm="25352" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="593340" vmpeak="665429" vmrss="240200" vmhwm="240200" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="CPU" vmsize="764711" vmpeak="1279238" vmrss="23544" vmhwm="528431" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="GPU" vmsize="890428" vmpeak="1316884" vmrss="544882" vmhwm="1035192" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="CPU" vmsize="1187529" vmpeak="1279207" vmrss="398512" vmhwm="528730" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="GPU" vmsize="1288707" vmpeak="1360796" vmrss="935778" vmhwm="1038888" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="CPU" vmsize="755634" vmpeak="1259024" vmrss="23342" vmhwm="507980" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="GPU" vmsize="845886" vmpeak="1297898" vmrss="500957" vmhwm="1016822" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="CPU" vmsize="1327246" vmpeak="1327246" vmrss="384634" vmhwm="507522" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="GPU" vmsize="1277117" vmpeak="1300490" vmrss="923674" vmhwm="1018956" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="CPU" vmsize="757556" vmpeak="1471373" vmrss="32780" vmhwm="716861" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="GPU" vmsize="1153103" vmpeak="1684306" vmrss="807426" vmhwm="1402513" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="CPU" vmsize="1397686" vmpeak="1471373" vmrss="528620" vmhwm="717728" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="GPU" vmsize="1597785" vmpeak="1680465" vmrss="1244672" vmhwm="1399217" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1485853" vmrss="14330" vmhwm="773766" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="604573" vmpeak="1684861" vmrss="259556" vmhwm="1403600" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1311107" vmpeak="1485862" vmrss="528448" vmhwm="773656" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1346840" vmpeak="1684896" vmrss="993942" vmhwm="1403886" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757187" vmpeak="831362" vmrss="78795" vmhwm="113814" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="805270" vmpeak="920321" vmrss="460319" vmhwm="495638" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="852781" vmpeak="852781" vmrss="119033" vmhwm="119033" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="847052" vmpeak="919142" vmrss="494916" vmhwm="494916" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="CPU" vmsize="754248" vmpeak="925443" vmrss="16878" vmhwm="177663" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="GPU" vmsize="657659" vmpeak="799510" vmrss="312070" vmhwm="466153" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="CPU" vmsize="920163" vmpeak="920163" vmrss="131859" vmhwm="176726" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="GPU" vmsize="775350" vmpeak="847440" vmrss="422919" vmhwm="467610" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760584" vmpeak="1338202" vmrss="43243" vmhwm="616928" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1104862" vmpeak="1557006" vmrss="759030" vmhwm="1275071" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1224172" vmpeak="1338172" vmrss="434944" vmhwm="616849" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1452145" vmpeak="1558106" vmrss="1099428" vmhwm="1276787" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="764878" vmpeak="1551919" vmrss="58638" vmhwm="828383" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1315120" vmpeak="1977250" vmrss="968858" vmhwm="1694796" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1526166" vmpeak="1598256" vmrss="582401" vmhwm="829598" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1804748" vmpeak="1975855" vmrss="1451397" vmhwm="1693419" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="CPU" vmsize="927665" vmpeak="2236845" vmrss="224034" vmhwm="1396458" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="GPU" vmsize="1988676" vmpeak="3156291" vmrss="1643919" vmhwm="2874946" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="CPU" vmsize="2016999" vmpeak="2236955" vmrss="1117754" vmhwm="1396128" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="GPU" vmsize="2845849" vmpeak="3165219" vmrss="2493550" vmhwm="2883091" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766101" vmpeak="1079971" vmrss="27359" vmhwm="362142" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="834856" vmpeak="1080094" vmrss="490089" vmhwm="799312" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046381" vmpeak="1118471" vmrss="260528" vmhwm="362203" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1060109" vmpeak="1132199" vmrss="707876" vmhwm="804108" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="CPU" vmsize="758516" vmpeak="930397" vmrss="40572" vmhwm="194062" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="GPU" vmsize="873061" vmpeak="1013430" vmrss="528167" vmhwm="692564" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="CPU" vmsize="957620" vmpeak="1029710" vmrss="152754" vmhwm="194656" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="GPU" vmsize="1014305" vmpeak="1086395" vmrss="662525" vmhwm="694821" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="CPU" vmsize="759382" vmpeak="1174707" vmrss="39265" vmhwm="401856" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="GPU" vmsize="983083" vmpeak="1257471" vmrss="637335" vmhwm="975444" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="CPU" vmsize="1140730" vmpeak="1174672" vmrss="315977" vmhwm="401508" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="GPU" vmsize="1251214" vmpeak="1323304" vmrss="899034" vmhwm="976474" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="754890" vmpeak="815095" vmrss="28833" vmhwm="43881" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="651974" vmpeak="746719" vmrss="306455" vmhwm="321345" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="824942" vmpeak="897032" vmrss="48567" vmhwm="48567" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="676328" vmpeak="748418" vmrss="324860" vmhwm="324860" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="758212" vmpeak="813208" vmrss="29691" vmhwm="44220" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="611789" vmpeak="706534" vmrss="266244" vmhwm="324007" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818549" vmpeak="890639" vmrss="47141" vmhwm="47141" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="677705" vmpeak="749795" vmrss="326163" vmhwm="326163" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757534" vmpeak="911495" vmrss="36445" vmhwm="182050" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="835683" vmpeak="973280" vmrss="490613" vmhwm="658640" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="941076" vmpeak="1013166" vmrss="148222" vmhwm="183185" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="989608" vmpeak="1061698" vmrss="637709" vmhwm="661746" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="757174" vmpeak="901648" vmrss="73409" vmhwm="106537" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="801644" vmpeak="915186" vmrss="456517" vmhwm="490520" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="847932" vmpeak="847932" vmrss="116410" vmhwm="116410" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="843022" vmpeak="915112" vmrss="490864" vmhwm="490864" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="CPU" vmsize="765393" vmpeak="900402" vmrss="71544" vmhwm="105032" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="GPU" vmsize="759668" vmpeak="872762" vmrss="414493" vmhwm="497701" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="CPU" vmsize="848438" vmpeak="900754" vmrss="113590" vmhwm="113590" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="GPU" vmsize="847620" vmpeak="919710" vmrss="495730" vmhwm="495730" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755374" vmpeak="1146156" vmrss="22026" vmhwm="370176" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="768451" vmpeak="1074730" vmrss="423662" vmhwm="794266" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113609" vmpeak="1185698" vmrss="313513" vmhwm="370035" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1134227" vmpeak="1206317" vmrss="783006" vmhwm="795000" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="CPU" vmsize="755796" vmpeak="1267802" vmrss="23746" vmhwm="383983" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="GPU" vmsize="794565" vmpeak="1272634" vmrss="449394" vmhwm="991632" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="CPU" vmsize="1234050" vmpeak="1306140" vmrss="421194" vmhwm="421194" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="GPU" vmsize="1348960" vmpeak="1421050" vmrss="999050" vmhwm="999050" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754006" vmpeak="2548497" vmrss="15598" vmhwm="1808624" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668602" vmpeak="3326708" vmrss="323791" vmhwm="3045328" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027181" vmpeak="2548497" vmrss="1242560" vmhwm="1808730" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2441076" vmpeak="3326708" vmrss="2088055" vmhwm="3045050" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="2618030" vmrss="15510" vmhwm="1877383" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="739222" vmpeak="3397112" vmrss="393866" vmhwm="3115085" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2073794" vmpeak="2618030" vmrss="1289741" vmhwm="1878289" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518340" vmpeak="3397081" vmrss="2165196" vmhwm="3114975" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="CPU" vmsize="764940" vmpeak="947157" vmrss="27988" vmhwm="223726" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="GPU" vmsize="789223" vmpeak="941683" vmrss="443788" vmhwm="641476" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="CPU" vmsize="962187" vmpeak="1034277" vmrss="177848" vmhwm="224180" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="GPU" vmsize="969069" vmpeak="1041158" vmrss="616990" vmhwm="641977" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="CPU" vmsize="755651" vmpeak="1654985" vmrss="24921" vmhwm="920400" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="GPU" vmsize="936892" vmpeak="1838610" vmrss="590994" vmhwm="1556526" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="CPU" vmsize="1433352" vmpeak="1654989" vmrss="639456" vmhwm="918693" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="GPU" vmsize="1613176" vmpeak="1824922" vmrss="1259940" vmhwm="1543031" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754692" vmpeak="4259393" vmrss="18013" vmhwm="3532412" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="719105" vmpeak="5906194" vmrss="373648" vmhwm="5623600" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3167040" vmpeak="4259380" vmrss="2378362" vmhwm="3531237" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4165801" vmpeak="5903801" vmrss="3812393" vmhwm="5621585" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753860" vmpeak="1101161" vmrss="14599" vmhwm="375399" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="577640" vmpeak="1037480" vmrss="232443" vmhwm="755972" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1059828" vmpeak="1131917" vmrss="272879" vmhwm="374721" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="957453" vmpeak="1037445" vmrss="605026" vmhwm="756606" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="16790" vmhwm="680072" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="678964" vmpeak="1435790" vmrss="334017" vmhwm="1154573" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1279823" vmpeak="1422647" vmrss="490692" vmhwm="680526" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1325156" vmpeak="1438571" vmrss="972140" vmhwm="1157138" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753733" vmpeak="954430" vmrss="14278" vmhwm="229913" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="GPU" vmsize="568880" vmpeak="814976" vmrss="223907" vmhwm="533808" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1032882" vmpeak="1032882" vmrss="174631" vmhwm="230243" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="GPU" vmsize="810031" vmpeak="816178" vmrss="456856" vmhwm="534503" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756852" vmpeak="1587154" vmrss="31460" vmhwm="837570" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1159840" vmpeak="1822444" vmrss="813969" vmhwm="1540343" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1554462" vmpeak="1626552" vmrss="609677" vmhwm="836655" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1735610" vmpeak="1821749" vmrss="1383285" vmhwm="1540598" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753856" vmpeak="1528538" vmrss="14414" vmhwm="815491" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="580030" vmpeak="1741062" vmrss="235624" vmhwm="1460386" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1339681" vmpeak="1528538" vmrss="556146" vmhwm="815262" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1389097" vmpeak="1741093" vmrss="1036178" vmhwm="1460060" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772622" vmpeak="985749" vmrss="95431" vmhwm="151087" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1141962" vmpeak="1252068" vmrss="796734" vmhwm="827217" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985239" vmpeak="1057328" vmrss="158532" vmhwm="158532" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171425" vmpeak="1243514" vmrss="818624" vmhwm="818624" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="1211720" vmrss="93486" vmhwm="426896" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1312801" vmpeak="1592839" vmrss="967252" vmhwm="1311569" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1198124" vmpeak="1270214" vmrss="353051" vmhwm="427319" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1657339" vmpeak="1729428" vmrss="1304820" vmhwm="1304820" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="796360" vmpeak="1002408" vmrss="123094" vmhwm="239945" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1352916" vmpeak="1472262" vmrss="1007630" vmhwm="1084727" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1059880" vmpeak="1059880" vmrss="239307" vmhwm="241753" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1437656" vmpeak="1509745" vmrss="1084828" vmhwm="1084828" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864635" vmpeak="1154040" vmrss="148830" vmhwm="322528" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1505042" vmpeak="1650162" vmrss="1159906" vmhwm="1343711" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181056" vmpeak="1253146" vmrss="315048" vmhwm="322282" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1719256" vmpeak="1791345" vmrss="1366767" vmhwm="1366767" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767976" vmpeak="1370195" vmrss="63456" vmhwm="539897" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1313452" vmpeak="1701664" vmrss="968145" vmhwm="1420434" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1295571" vmpeak="1370195" vmrss="430610" vmhwm="539536" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1651421" vmpeak="1723510" vmrss="1299738" vmhwm="1422326" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="3124338" vmrss="17362" vmhwm="1770388" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="GPU" vmsize="669583" vmpeak="3628222" vmrss="324363" vmhwm="3347071" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2705824" vmpeak="3124338" vmrss="1906933" vmhwm="1906933" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="GPU" vmsize="3710449" vmpeak="3782539" vmrss="3356861" vmhwm="3356861" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="1192276" vmrss="32300" vmhwm="470417" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="GPU" vmsize="772970" vmpeak="1363872" vmrss="428054" vmhwm="1079412" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="CPU" vmsize="1123746" vmpeak="1195836" vmrss="335288" vmhwm="470162" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="GPU" vmsize="1219618" vmpeak="1362376" vmrss="875415" vmhwm="1077560" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848157" vmpeak="1522730" vmrss="178424" vmhwm="792470" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1549574" vmpeak="2182501" vmrss="1203804" vmhwm="1900742" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437730" vmpeak="1522730" vmrss="644402" vmhwm="794024" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2145426" vmpeak="2217516" vmrss="1793162" vmhwm="1899854" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="756584" vmpeak="925636" vmrss="32982" vmhwm="182529" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="769230" vmpeak="907847" vmrss="423874" vmhwm="604982" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="928659" vmpeak="928659" vmrss="142304" vmhwm="182353" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="926103" vmpeak="998192" vmrss="572985" vmhwm="603592" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="CPU" vmsize="757851" vmpeak="1078682" vmrss="34751" vmhwm="348154" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="GPU" vmsize="911473" vmpeak="1183102" vmrss="565549" vmhwm="900992" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="CPU" vmsize="1051652" vmpeak="1123742" vmrss="258231" vmhwm="349131" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="GPU" vmsize="1182570" vmpeak="1254660" vmrss="829659" vmhwm="899540" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764319" vmpeak="1327506" vmrss="61375" vmhwm="601048" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1206559" vmpeak="1676272" vmrss="860362" vmhwm="1393906" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1228396" vmpeak="1327475" vmrss="441135" vmhwm="603394" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1637486" vmpeak="1709576" vmrss="1285376" vmhwm="1398377" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="CPU" vmsize="761046" vmpeak="1754029" vmrss="43916" vmhwm="1002368" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="GPU" vmsize="1026110" vmpeak="2108686" vmrss="680191" vmhwm="1826792" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="CPU" vmsize="1512095" vmpeak="1753998" vmrss="701483" vmhwm="1002333" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="GPU" vmsize="1880973" vmpeak="2110306" vmrss="1532348" vmhwm="1828952" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="CPU" vmsize="759695" vmpeak="1636430" vmrss="38011" vmhwm="883225" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="GPU" vmsize="1118880" vmpeak="1994964" vmrss="773102" vmhwm="1713034" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="CPU" vmsize="1430871" vmpeak="1636434" vmrss="617078" vmhwm="882886" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="GPU" vmsize="1804484" vmpeak="1993530" vmrss="1450724" vmhwm="1711340" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="821893" vmrss="55070" vmhwm="82354" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="626304" vmpeak="734201" vmrss="280918" vmhwm="362925" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831344" vmpeak="903434" vmrss="86495" vmhwm="86495" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="718357" vmpeak="790446" vmrss="367096" vmhwm="367096" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756826" vmpeak="819711" vmrss="53961" vmhwm="77206" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758023" vmpeak="861784" vmrss="412702" vmhwm="436805" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="836470" vmpeak="891765" vmrss="83050" vmhwm="83050" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="788986" vmpeak="861075" vmrss="437646" vmhwm="437646" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="804491" vmrss="17490" vmhwm="28454" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="578894" vmpeak="670546" vmrss="233547" vmhwm="245172" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="28314" vmhwm="28314" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600507" vmpeak="672597" vmrss="247596" vmhwm="247596" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="753530" vmpeak="881588" vmrss="13208" vmhwm="35261" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570042" vmpeak="661702" vmrss="224870" vmhwm="353003" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="901260" vmrss="107390" vmhwm="107390" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="686408" vmpeak="758498" vmrss="332895" vmhwm="351907" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14546" vmhwm="25586" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577288" vmpeak="667682" vmrss="231642" vmhwm="242167" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="24468" vmhwm="24468" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="595588" vmpeak="667678" vmrss="242246" vmhwm="242246" />
+        <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="CPU" vmsize="753838" vmpeak="907420" vmrss="80674" vmhwm="122086" />
+        <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="GPU" vmsize="675633" vmpeak="798283" vmrss="330184" vmhwm="372754" />
+        <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="CPU" vmsize="841390" vmpeak="913479" vmrss="123776" vmhwm="123776" />
+        <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="GPU" vmsize="726066" vmpeak="798155" vmrss="390764" vmhwm="390764" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754080" vmpeak="884950" vmrss="35930" vmhwm="56368" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="GPU" vmsize="613082" vmpeak="713020" vmrss="267753" vmhwm="358019" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="CPU" vmsize="847726" vmpeak="919815" vmrss="83300" vmhwm="83300" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="GPU" vmsize="710754" vmpeak="782843" vmrss="357442" vmhwm="357442" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760821" vmpeak="1370292" vmrss="44242" vmhwm="618965" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1077643" vmpeak="1594964" vmrss="731733" vmhwm="1313127" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1256200" vmpeak="1370261" vmrss="444043" vmhwm="617852" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1494732" vmpeak="1596218" vmrss="1141690" vmhwm="1314187" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765322" vmpeak="1593790" vmrss="61120" vmhwm="831661" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1339184" vmpeak="2040148" vmrss="993968" vmhwm="1758746" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1414652" vmpeak="1593754" vmrss="594426" vmhwm="832220" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1871271" vmpeak="2037904" vmrss="1518501" vmhwm="1756343" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="760650" vmpeak="1369557" vmrss="43384" vmhwm="618015" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1022863" vmpeak="1592206" vmrss="676698" vmhwm="1309880" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1255557" vmpeak="1369522" vmrss="445350" vmhwm="618750" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1490077" vmpeak="1591563" vmrss="1137444" vmhwm="1309910" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="765204" vmpeak="1593108" vmrss="61124" vmhwm="831353" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1340754" vmpeak="2034586" vmrss="995636" vmhwm="1753100" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1413992" vmpeak="1593077" vmrss="592710" vmhwm="831098" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1867096" vmpeak="2036610" vmrss="1514532" vmhwm="1755089" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="CPU" vmsize="766911" vmpeak="1356080" vmrss="64389" vmhwm="623026" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="GPU" vmsize="1105068" vmpeak="1552320" vmrss="759990" vmhwm="1271340" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="CPU" vmsize="1258699" vmpeak="1356084" vmrss="468780" vmhwm="623788" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="GPU" vmsize="1478730" vmpeak="1553591" vmrss="1126364" vmhwm="1272167" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="CPU" vmsize="761239" vmpeak="1894468" vmrss="40691" vmhwm="1139410" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="GPU" vmsize="1418938" vmpeak="2248351" vmrss="1073886" vmhwm="1967262" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="CPU" vmsize="1618592" vmpeak="1894499" vmrss="810946" vmhwm="1140422" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="GPU" vmsize="1996112" vmpeak="2247322" vmrss="1660700" vmhwm="1965405" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="754987" vmpeak="880664" vmrss="29475" vmhwm="43832" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="616360" vmpeak="711106" vmrss="270859" vmhwm="322498" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818562" vmpeak="818562" vmrss="47141" vmhwm="47141" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="674124" vmpeak="746213" vmrss="322731" vmhwm="322731" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146433" vmrss="21806" vmhwm="370044" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="775324" vmpeak="1077709" vmrss="430342" vmhwm="796857" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113904" vmpeak="1185993" vmrss="312527" vmhwm="370946" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1137391" vmpeak="1137391" vmrss="785391" vmhwm="793201" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754133" vmpeak="2548906" vmrss="14955" vmhwm="1807044" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668619" vmpeak="3326725" vmrss="322691" vmhwm="3044404" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027476" vmpeak="2548906" vmrss="1242678" vmhwm="1808470" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438563" vmpeak="3326725" vmrss="2085028" vmhwm="3044505" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754226" vmpeak="2618325" vmrss="15708" vmhwm="1877977" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="741092" vmpeak="3397116" vmrss="396074" vmhwm="3115345" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2074089" vmpeak="2618325" vmrss="1290049" vmhwm="1878672" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518436" vmpeak="3397178" vmrss="2165728" vmhwm="3115459" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754701" vmpeak="4259684" vmrss="17626" vmhwm="3531853" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="747582" vmpeak="5921322" vmrss="402490" vmhwm="5639084" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3095241" vmpeak="4259670" vmrss="2379062" vmhwm="3530652" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4163667" vmpeak="5923566" vmrss="3810193" vmhwm="5640967" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="754023" vmpeak="1334414" vmrss="15254" vmhwm="608322" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="600701" vmpeak="1330978" vmrss="255912" vmhwm="1049844" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1215838" vmpeak="1334383" vmrss="428331" vmhwm="607442" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="1199972" vmpeak="1330384" vmrss="847391" vmhwm="1049228" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="CPU" vmsize="755387" vmpeak="1175570" vmrss="25374" vmhwm="306904" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="GPU" vmsize="805222" vmpeak="1346307" vmrss="460781" vmhwm="1065873" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="CPU" vmsize="1188580" vmpeak="1260670" vmrss="336036" vmhwm="336036" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="GPU" vmsize="1449408" vmpeak="1521498" vmrss="1096792" vmhwm="1096792" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="CPU" vmsize="756822" vmpeak="1181615" vmrss="28468" vmhwm="309716" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="GPU" vmsize="819271" vmpeak="2432738" vmrss="474764" vmhwm="1101047" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="CPU" vmsize="1189117" vmpeak="1261207" vmrss="333788" vmhwm="333788" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="GPU" vmsize="2539222" vmpeak="2611312" vmrss="2191604" vmhwm="2191604" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="757878" vmpeak="1077934" vmrss="35261" vmhwm="348964" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="899610" vmpeak="1179116" vmrss="553863" vmhwm="896997" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1050878" vmpeak="1077876" vmrss="256506" vmhwm="347974" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1179239" vmpeak="1251329" vmrss="826553" vmhwm="897714" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="760456" vmpeak="1096708" vmrss="27315" vmhwm="361944" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="834275" vmpeak="1073569" vmrss="489086" vmhwm="792343" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1058622" vmpeak="1130712" vmrss="267682" vmhwm="362749" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1050852" vmpeak="1122941" vmrss="697576" vmhwm="791040" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="CPU" vmsize="755950" vmpeak="1092203" vmrss="27640" vmhwm="362740" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="GPU" vmsize="835951" vmpeak="1073516" vmrss="490674" vmhwm="792224" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="CPU" vmsize="1058626" vmpeak="1130716" vmrss="266516" vmhwm="361992" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="GPU" vmsize="1050218" vmpeak="1071435" vmrss="696669" vmhwm="789848" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="880550" vmrss="29603" vmhwm="43212" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="648881" vmpeak="743626" vmrss="303424" vmhwm="318348" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="818246" vmpeak="818246" vmrss="46534" vmhwm="46534" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="674146" vmpeak="746235" vmrss="320315" vmhwm="320315" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="CPU" vmsize="764755" vmpeak="2092574" vmrss="38016" vmhwm="1352450" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="GPU" vmsize="1578328" vmpeak="3355976" vmrss="1233474" vmhwm="3074953" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="CPU" vmsize="1802838" vmpeak="2092587" vmrss="994188" vmhwm="1352709" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="GPU" vmsize="2958472" vmpeak="3352694" vmrss="2607677" vmhwm="3072185" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="CPU" vmsize="765124" vmpeak="2035453" vmrss="39745" vmhwm="1292420" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="GPU" vmsize="1939801" vmpeak="3261715" vmrss="1594617" vmhwm="2980577" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="CPU" vmsize="1750196" vmpeak="2039945" vmrss="935774" vmhwm="1291963" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="GPU" vmsize="2902235" vmpeak="3265460" vmrss="2551727" vmhwm="2984352" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="CPU" vmsize="757587" vmpeak="1547678" vmrss="33004" vmhwm="718973" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="GPU" vmsize="1154670" vmpeak="1678943" vmrss="809811" vmhwm="1398284" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="CPU" vmsize="1553134" vmpeak="1553134" vmrss="606232" vmhwm="719791" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="GPU" vmsize="1753910" vmpeak="1826000" vmrss="1400234" vmhwm="1400234" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="CPU" vmsize="757160" vmpeak="867486" vmrss="41307" vmhwm="62678" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="GPU" vmsize="743283" vmpeak="841055" vmrss="398604" vmhwm="537209" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="CPU" vmsize="888087" vmpeak="960176" vmrss="114166" vmhwm="114166" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="GPU" vmsize="894339" vmpeak="966429" vmrss="541912" vmhwm="541912" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772728" vmpeak="951218" vmrss="95840" vmhwm="151676" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1135195" vmpeak="1245301" vmrss="789848" vmhwm="820410" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985450" vmpeak="1057540" vmrss="159046" vmhwm="159046" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171152" vmpeak="1243242" vmrss="818598" vmhwm="818598" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="864168" vmpeak="998263" vmrss="126266" vmhwm="241604" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1353237" vmpeak="1472583" vmrss="1007978" vmhwm="1094614" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060316" vmpeak="1132406" vmrss="238326" vmhwm="240724" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1447146" vmpeak="1519236" vmrss="1094759" vmhwm="1097835" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="CPU" vmsize="757156" vmpeak="826843" vmrss="69031" vmhwm="100887" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="GPU" vmsize="796250" vmpeak="906813" vmrss="451171" vmhwm="482077" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="CPU" vmsize="849041" vmpeak="849041" vmrss="104464" vmhwm="104464" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="GPU" vmsize="833984" vmpeak="906074" vmrss="481786" vmhwm="481786" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="CPU" vmsize="760786" vmpeak="1139173" vmrss="66413" vmhwm="353346" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="GPU" vmsize="1055560" vmpeak="1255601" vmrss="710595" vmhwm="974815" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="CPU" vmsize="1097984" vmpeak="1170074" vmrss="281050" vmhwm="352228" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="GPU" vmsize="1259253" vmpeak="1331343" vmrss="906562" vmhwm="976483" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="920884" vmpeak="2443892" vmrss="237186" vmhwm="851215" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1751376" vmpeak="4164239" vmrss="1406411" vmhwm="3883422" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="757323" vmpeak="986519" vmrss="35006" vmhwm="212911" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="862219" vmpeak="1179283" vmrss="516881" vmhwm="897930" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="761538" vmpeak="1491811" vmrss="45667" vmhwm="671554" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1126884" vmpeak="1800550" vmrss="781739" vmhwm="1519302" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="CPU" vmsize="766964" vmpeak="1233342" vmrss="29568" vmhwm="415509" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="GPU" vmsize="897432" vmpeak="1347007" vmrss="553357" vmhwm="1067290" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="CPU" vmsize="756562" vmpeak="1099533" vmrss="30078" vmhwm="245590" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="GPU" vmsize="764170" vmpeak="1353149" vmrss="419267" vmhwm="1072244" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="CPU" vmsize="1478496" vmpeak="1478496" vmrss="332820" vmhwm="332820" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="GPU" vmsize="1423364" vmpeak="1495454" vmrss="1070973" vmhwm="1172441" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="755092" vmpeak="815298" vmrss="28811" vmhwm="43687" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="620734" vmpeak="715479" vmrss="274991" vmhwm="324935" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="825268" vmpeak="825268" vmrss="48439" vmhwm="48439" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="680592" vmpeak="752681" vmrss="326972" vmhwm="326972" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="765182" vmpeak="880712" vmrss="29827" vmhwm="44149" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="612620" vmpeak="707366" vmrss="266855" vmhwm="323734" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818879" vmpeak="818879" vmrss="46534" vmhwm="46534" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="681010" vmpeak="753099" vmrss="326902" vmhwm="326902" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848056" vmpeak="1522360" vmrss="147382" vmhwm="794481" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1699992" vmpeak="2187231" vmrss="1354892" vmhwm="1906344" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437365" vmpeak="1522364" vmrss="643724" vmhwm="793755" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2152515" vmpeak="2224604" vmrss="1800026" vmhwm="1900395" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757526" vmpeak="905132" vmrss="83195" vmhwm="119653" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="815988" vmpeak="932663" vmrss="470742" vmhwm="507760" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="1007820" vmpeak="1007820" vmrss="123926" vmhwm="123926" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="861520" vmpeak="933609" vmrss="507870" vmhwm="507870" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="756756" vmpeak="925425" vmrss="34007" vmhwm="180769" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="824168" vmpeak="962403" vmrss="478737" vmhwm="610280" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927669" vmpeak="999759" vmrss="141772" vmhwm="181966" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="936755" vmpeak="1008845" vmrss="583963" vmhwm="611516" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="759013" vmpeak="1063559" vmrss="51255" vmhwm="349113" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="925958" vmpeak="1184101" vmrss="580056" vmhwm="902325" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1043583" vmpeak="1115672" vmrss="263520" vmhwm="349034" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1189548" vmpeak="1261638" vmrss="836646" vmhwm="903676" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764574" vmpeak="1327493" vmrss="64108" vmhwm="603842" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1221717" vmpeak="1686643" vmrss="875617" vmhwm="1404475" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1381556" vmpeak="1403402" vmrss="440356" vmhwm="602751" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1641921" vmpeak="1714011" vmrss="1289340" vmhwm="1405430" />
+        <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="762119" vmpeak="2738828" vmrss="47203" vmhwm="947557" />
+        <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1295483" vmpeak="4189812" vmrss="949788" vmhwm="3908550" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="CPU" vmsize="763840" vmpeak="805556" vmrss="21938" vmhwm="33264" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="GPU" vmsize="652572" vmpeak="744180" vmrss="306754" vmhwm="318432" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="CPU" vmsize="814000" vmpeak="814000" vmrss="33391" vmhwm="33391" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="GPU" vmsize="672144" vmpeak="744233" vmrss="319026" vmhwm="319026" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="CPU" vmsize="754705" vmpeak="881188" vmrss="29282" vmhwm="44836" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="GPU" vmsize="614209" vmpeak="709759" vmrss="268778" vmhwm="326845" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="CPU" vmsize="818228" vmpeak="890318" vmrss="45513" vmhwm="45513" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="GPU" vmsize="682484" vmpeak="754573" vmrss="328966" vmhwm="328966" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="754903" vmpeak="821928" vmrss="55237" vmhwm="82768" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="643887" vmpeak="751788" vmrss="298685" vmhwm="367602" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="831111" vmpeak="831111" vmrss="86732" vmhwm="86732" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="720979" vmpeak="793069" vmrss="367584" vmhwm="367584" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="819759" vmrss="54586" vmhwm="78570" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="705724" vmpeak="809490" vmrss="360267" vmhwm="435512" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="835978" vmpeak="835978" vmrss="82583" vmhwm="82583" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="788902" vmpeak="860992" vmrss="435727" vmhwm="435727" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="CPU" vmsize="756725" vmpeak="831080" vmrss="76414" vmhwm="111914" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="GPU" vmsize="787058" vmpeak="902290" vmrss="441399" vmhwm="476911" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="CPU" vmsize="847299" vmpeak="847299" vmrss="120969" vmhwm="120969" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="GPU" vmsize="828920" vmpeak="901010" vmrss="475939" vmhwm="475939" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="CPU" vmsize="760988" vmpeak="1018754" vmrss="14484" vmhwm="296612" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="GPU" vmsize="600859" vmpeak="965967" vmrss="255569" vmhwm="685150" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="CPU" vmsize="1095155" vmpeak="1167245" vmrss="304607" vmhwm="304607" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="GPU" vmsize="1004577" vmpeak="1076666" vmrss="651943" vmhwm="689915" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="CPU" vmsize="756096" vmpeak="1100136" vmrss="27812" vmhwm="362344" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="GPU" vmsize="822830" vmpeak="1073947" vmrss="477193" vmhwm="792264" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="CPU" vmsize="1060571" vmpeak="1132661" vmrss="269808" vmhwm="362771" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="GPU" vmsize="1054684" vmpeak="1075272" vmrss="702310" vmhwm="794314" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760764" vmpeak="1338383" vmrss="42706" vmhwm="617047" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1108602" vmpeak="1561885" vmrss="762616" vmhwm="1279700" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1279819" vmpeak="1338409" vmrss="435102" vmhwm="617865" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1455146" vmpeak="1561388" vmrss="1101755" vmhwm="1279845" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765221" vmpeak="1552262" vmrss="59875" vmhwm="829250" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1322098" vmpeak="1985359" vmrss="976223" vmhwm="1703319" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1373006" vmpeak="1552293" vmrss="581891" vmhwm="829848" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1814348" vmpeak="1986380" vmrss="1461099" vmhwm="1704714" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766088" vmpeak="1079958" vmrss="27324" vmhwm="362155" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="838965" vmpeak="1085884" vmrss="493407" vmhwm="804324" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046157" vmpeak="1118246" vmrss="260515" vmhwm="362810" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1057223" vmpeak="1080772" vmrss="704066" vmhwm="799440" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="761754" vmpeak="1365104" vmrss="45179" vmhwm="620879" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1120737" vmpeak="1613546" vmrss="774637" vmhwm="1331308" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1251346" vmpeak="1365135" vmrss="446415" vmhwm="620241" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1515817" vmpeak="1613858" vmrss="1162572" vmhwm="1331968" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="839823" vmpeak="1569361" vmrss="155029" vmhwm="833157" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1363960" vmpeak="2068752" vmrss="1018507" vmhwm="1787042" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1476041" vmpeak="1569392" vmrss="679918" vmhwm="833914" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1904799" vmpeak="2060317" vmrss="1551756" vmhwm="1778167" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="CPU" vmsize="756602" vmpeak="1096774" vmrss="28393" vmhwm="363391" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="GPU" vmsize="845226" vmpeak="1103374" vmrss="500051" vmhwm="821986" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="CPU" vmsize="1063304" vmpeak="1135393" vmrss="271220" vmhwm="364399" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="GPU" vmsize="1092159" vmpeak="1105997" vmrss="738276" vmhwm="823983" />
+        <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="838816" vmpeak="1561762" vmrss="116930" vmhwm="752906" />
+        <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1674490" vmpeak="2318250" vmrss="1329842" vmhwm="2034986" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="755062" vmpeak="880739" vmrss="28415" vmhwm="43480" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="609298" vmpeak="704044" vmrss="263868" vmhwm="323488" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="825048" vmpeak="897138" vmrss="49108" vmhwm="49108" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="675844" vmpeak="747934" vmrss="322753" vmhwm="322753" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="CPU" vmsize="756804" vmpeak="978252" vmrss="70514" vmhwm="120370" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="GPU" vmsize="831318" vmpeak="949744" vmrss="485619" vmhwm="524550" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="CPU" vmsize="925689" vmpeak="997779" vmrss="130244" vmhwm="130244" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="GPU" vmsize="878099" vmpeak="950188" vmrss="525395" vmhwm="525395" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="CPU" vmsize="759435" vmpeak="1442861" vmrss="34680" vmhwm="509454" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="GPU" vmsize="1012906" vmpeak="1460487" vmrss="667977" vmhwm="1179833" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="CPU" vmsize="1368043" vmpeak="1442861" vmrss="427737" vmhwm="509533" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="GPU" vmsize="1542648" vmpeak="1542648" vmrss="1195304" vmhwm="1195304" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="CPU" vmsize="759558" vmpeak="1426185" vmrss="33862" vmhwm="507768" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="GPU" vmsize="1010358" vmpeak="1414454" vmrss="665451" vmhwm="1133941" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="CPU" vmsize="1350650" vmpeak="1426185" vmrss="421828" vmhwm="509168" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="GPU" vmsize="1493681" vmpeak="1565770" vmrss="1145416" vmhwm="1145416" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="761433" vmpeak="985784" vmrss="41514" vmhwm="254610" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="876933" vmpeak="1078919" vmrss="531814" vmhwm="798001" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="CPU" vmsize="1028508" vmpeak="1064698" vmrss="201212" vmhwm="254390" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="GPU" vmsize="1091807" vmpeak="1163896" vmrss="739525" vmhwm="798023" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="CPU" vmsize="754067" vmpeak="1169247" vmrss="15686" vmhwm="429523" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="GPU" vmsize="682413" vmpeak="1130109" vmrss="337194" vmhwm="848733" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="CPU" vmsize="1106463" vmpeak="1178553" vmrss="321428" vmhwm="429871" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="GPU" vmsize="1083904" vmpeak="1155994" vmrss="730976" vmhwm="845882" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754010" vmpeak="2548502" vmrss="15452" vmhwm="1807863" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="686602" vmpeak="3327385" vmrss="340982" vmhwm="3045398" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2026776" vmpeak="2548502" vmrss="1241011" vmhwm="1808730" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438568" vmpeak="3312188" vmrss="2084328" vmhwm="3029980" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754168" vmpeak="2617986" vmrss="16073" vmhwm="1877000" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="612194" vmpeak="3415310" vmrss="266732" vmhwm="3133363" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2145479" vmpeak="2617885" vmrss="1287272" vmhwm="1877568" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2521367" vmpeak="3415297" vmrss="2167426" vmhwm="3133059" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1426625" vmrss="17173" vmhwm="684173" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="684424" vmpeak="1460949" vmrss="339600" vmhwm="1180036" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1282802" vmpeak="1426625" vmrss="493737" vmhwm="684802" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1331783" vmpeak="1443006" vmrss="978560" vmhwm="1161124" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="CPU" vmsize="753724" vmpeak="954421" vmrss="14414" vmhwm="229578" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="GPU" vmsize="569179" vmpeak="816648" vmrss="224250" vmhwm="535449" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="CPU" vmsize="960810" vmpeak="960810" vmrss="174231" vmhwm="229807" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="GPU" vmsize="808627" vmpeak="880717" vmrss="455677" vmhwm="533002" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="17437" vmhwm="680666" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="GPU" vmsize="686316" vmpeak="1436296" vmrss="340586" vmhwm="1154617" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="CPU" vmsize="1279797" vmpeak="1422616" vmrss="490982" vmhwm="680147" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="GPU" vmsize="1330780" vmpeak="1442570" vmrss="978392" vmhwm="1161490" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756958" vmpeak="1587260" vmrss="31108" vmhwm="836506" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1163712" vmpeak="1824596" vmrss="819011" vmhwm="1543559" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1405879" vmpeak="1591766" vmrss="610302" vmhwm="836594" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1734233" vmpeak="1823470" vmrss="1381925" vmhwm="1542178" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753975" vmpeak="895633" vmrss="15637" vmhwm="140927" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="GPU" vmsize="599332" vmpeak="728939" vmrss="254029" vmhwm="412566" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="CPU" vmsize="903469" vmpeak="975559" vmrss="116124" vmhwm="141182" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="GPU" vmsize="741738" vmpeak="813828" vmrss="389259" vmhwm="413476" />
+    </models>
+</attributes>
\ No newline at end of file
diff --git a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..1a50d72
--- /dev/null
@@ -0,0 +1,156 @@
+<?xml version="1.0"?>
+<attributes>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>caffe/FP32/caffenet/caffenet.xml</value>
+        <value>caffe/FP32/densenet_121/densenet_121.xml</value>
+        <value>caffe/FP32/densenet_161/densenet_161.xml</value>
+        <value>caffe/FP32/densenet_169/densenet_169.xml</value>
+        <value>caffe/FP32/densenet_201/densenet_201.xml</value>
+        <value>caffe/FP32/dpn_92/dpn_92.xml</value>
+        <value>caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>caffe/FP32/inception_v1/inception_v1.xml</value>
+        <value>caffe/FP32/inception_v2/inception_v2.xml</value>
+        <value>caffe/FP32/inception_v3/inception_v3.xml</value>
+        <value>caffe/FP32/inception_v4/inception_v4.xml</value>
+        <value>caffe/FP32/lenet/lenet.xml</value>
+        <value>caffe/FP32/mobilenet/mobilenet.xml</value>
+        <value>caffe/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+        <value>caffe/FP32/resnet_18/resnet_18.xml</value>
+        <value>caffe/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+        <value>caffe/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>caffe/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>caffe/FP32/resnet_v1_269/resnet_v1_269.xml</value>
+        <value>caffe/FP32/se_resnext_50/se_resnext_50.xml</value>
+        <value>caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml</value>
+        <value>caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>caffe/FP32/ssd_googlenet/ssd_googlenet.xml</value>
+        <value>caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml</value>
+        <value>caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml</value>
+        <value>caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+        <value>caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml</value>
+        <value>caffe/FP32/vgg16/vgg16.xml</value>
+        <value>caffe/FP32/vgg19/vgg19.xml</value>
+        <value>caffe/FP32/wrn_50_2/wrn_50_2.xml</value>
+        <value>caffe/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+        <value>caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+        <value>caffe/FP32/yolo_v2/yolo_v2.xml</value>
+        <value>caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml</value>
+        <value>caffe/FP32/yolo_v3/yolo_v3.xml</value>
+        <value>caffe/FP32/dilation/dilation.xml</value>
+        <value>caffe/FP32/dssd/dssd.xml</value>
+        <value>caffe/FP32/fcn8/fcn8.xml</value>
+        <value>caffe/FP32/fcn32/fcn32.xml</value>
+        <value>caffe/FP32/fcn_alexnet/fcn_alexnet.xml</value>
+        <value>caffe/FP32/mtcnn_p/mtcnn_p.xml</value>
+        <value>caffe/FP32/mtcnn_r/mtcnn_r.xml</value>
+        <value>caffe/FP32/mtcnn_o/mtcnn_o.xml</value>
+        <value>caffe/FP32/openpose_face/openpose_face.xml</value>
+        <value>caffe/FP32/openpose_hand/openpose_hand.xml</value>
+        <value>caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml</value>
+        <value>caffe/FP32/places205_alexnet/places205_alexnet.xml</value>
+        <value>caffe/FP32/places205_googlenet/places205_googlenet.xml</value>
+        <value>caffe/FP32/se_bn_inception/se_bn_inception.xml</value>
+        <value>caffe/FP32/vnect/vnect.xml</value>
+        <value>tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml</value>
+        <value>tf/1.14.0/FP32/bert_xnli/bert_xnli.xml</value>
+        <value>tf/1.14.0/FP32/cmu/cmu.xml</value>
+        <value>tf/1.14.0/FP32/densenet_121/densenet_121.xml</value>
+        <value>tf/1.14.0/FP32/densenet_169/densenet_169.xml</value>
+        <value>tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml</value>
+        <value>tf/1.14.0/FP32/east/east.xml</value>
+        <value>tf/1.14.0/FP32/facenet/facenet.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml</value>
+        <value>tf/1.14.0/FP32/gnmt/gnmt.xml</value>
+        <value>tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml</value>
+        <value>tf/1.14.0/FP32/inception_v1/inception_v1.xml</value>
+        <value>tf/1.14.0/FP32/inception_v2/inception_v2.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/inception_v4/inception_v4.xml</value>
+        <value>tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml</value>
+        <value>tf/1.14.0/FP32/ncf/ncf.xml</value>
+        <value>tf/1.14.0/FP32/nasnet-a_large/nasnet-a_large.xml</value>
+        <value>tf/1.14.0/FP32/nasnet-a_mobile/nasnet-a_mobile.xml</value>
+        <value>tf/1.14.0/FP32/pnasnet-5_large/pnasnet-5_large.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+        <value>tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml</value>
+        <value>tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml</value>
+        <value>tf/1.14.0/FP32/unet2d/unet2d.xml</value>
+        <value>tf/1.14.0/FP32/vgg16/vgg16.xml</value>
+        <value>tf/1.14.0/FP32/vgg19/vgg19.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2/yolo_v2.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v3/yolo_v3.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml</value>
+        <value>tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml</value>
+        <value>tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml</value>
+        <value>tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml</value>
+        <value>mxnet/FP32/caffenet/caffenet.xml</value>
+        <value>mxnet/FP32/densenet_121/densenet_121.xml</value>
+        <value>mxnet/FP32/densenet_161/densenet_161.xml</value>
+        <value>mxnet/FP32/densenet_169/densenet_169.xml</value>
+        <value>mxnet/FP32/densenet_201/densenet_201.xml</value>
+        <value>mxnet/FP32/inception_v3/inception_v3.xml</value>
+        <value>mxnet/FP32/inception_v4/inception_v4.xml</value>
+        <value>mxnet/FP32/mobilenet/mobilenet.xml</value>
+        <value>mxnet/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+        <value>mxnet/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>mxnet/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>mxnet/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+        <value>mxnet/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+        <value>mxnet/FP32/resnext_101/resnext_101.xml</value>
+        <value>mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>mxnet/FP32/ssd_inception_v3_512/ssd_inception_v3_512.xml</value>
+        <value>mxnet/FP32/ssd_mobilenet_512/ssd_mobilenet_512.xml</value>
+        <value>mxnet/FP32/ssd_resnet50_512/ssd_resnet50_512.xml</value>
+        <value>mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+        <value>mxnet/FP32/vgg16/vgg16.xml</value>
+        <value>mxnet/FP32/vgg19/vgg19.xml</value>
+        <value>mxnet/FP32/dpn_92/dpn_92.xml</value>
+        <value>mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml</value>
+        <value>mxnet/FP32/full_imagenet_network/full_imagenet_network.xml</value>
+        <value>mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml</value>
+        <value>mxnet/FP32/location_net/location_net.xml</value>
+        <value>mxnet/FP32/lresnet100e/lresnet100e.xml</value>
+        <value>mxnet/FP32/mtcnn_p/mtcnn_p.xml</value>
+        <value>mxnet/FP32/mtcnn_r/mtcnn_r.xml</value>
+        <value>mxnet/FP32/mtcnn_o/mtcnn_o.xml</value>
+        <value>mxnet/FP32/nin/nin.xml</value>
+        <value>mxnet/FP32/nst_vgg19/nst_vgg19.xml</value>
+        <value>mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml</value>
+        <value>mxnet/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+        <value>mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+        <value>onnx/FP32/ssd_resnet34/ssd_resnet34.xml</value>
+        <value>onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml</value>
+        <value>onnx/FP32/retina_net/retina_net.xml</value>
+        <value>pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml</value>
+        <value>pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml</value>
+        <value>pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml</value>
+        <value>pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml</value>
+    </models>
+</attributes>
\ No newline at end of file
diff --git a/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_references_config.xml b/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_references_config.xml
new file mode 100644 (file)
index 0000000..82a6c6c
--- /dev/null
@@ -0,0 +1,533 @@
+<?xml version="1.0"?>
+<attributes>
+    <models>
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753847" vmpeak="1528832" vmrss="14005" vmhwm="814655" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="create_exenetwork" device="GPU" vmsize="580025" vmpeak="1743759" vmrss="234704" vmhwm="1462062" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1339971" vmpeak="1528828" vmrss="555262" vmhwm="814805" />
+        <model path="caffe/FP32/alexnet/alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1389159" vmpeak="1741154" vmrss="1036169" vmhwm="1460052" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753843" vmpeak="1545451" vmrss="14234" vmhwm="821334" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="602206" vmpeak="1511325" vmrss="257501" vmhwm="1230284" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1368206" vmpeak="1545456" vmrss="576774" vmhwm="821739" />
+        <model path="caffe/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1423096" vmpeak="1511373" vmrss="1074752" vmhwm="1230732" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772626" vmpeak="985754" vmrss="95260" vmhwm="151496" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1044604" vmpeak="1154709" vmrss="699168" vmhwm="811104" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985525" vmpeak="1057614" vmrss="159306" vmhwm="159306" />
+        <model path="caffe/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1163289" vmpeak="1235379" vmrss="812961" vmhwm="812961" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762770" vmpeak="1212248" vmrss="93570" vmhwm="426817" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1127847" vmpeak="1586310" vmrss="782029" vmhwm="1304679" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1351816" vmpeak="1423906" vmrss="353738" vmhwm="427644" />
+        <model path="caffe/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1660304" vmpeak="1660304" vmrss="1309215" vmhwm="1309215" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="791863" vmpeak="998329" vmrss="123059" vmhwm="240160" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1309598" vmpeak="1428944" vmrss="964066" vmhwm="1086751" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060303" vmpeak="1132392" vmrss="238924" vmhwm="240416" />
+        <model path="caffe/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1435214" vmpeak="1507303" vmrss="1084969" vmhwm="1084969" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864639" vmpeak="1153900" vmrss="147906" vmhwm="322590" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1541161" vmpeak="1686282" vmrss="1195972" vmhwm="1337595" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181479" vmpeak="1253568" vmrss="315581" vmhwm="322700" />
+        <model path="caffe/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1706760" vmpeak="1778849" vmrss="1356533" vmhwm="1356533" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="CPU" vmsize="754428" vmpeak="3004311" vmrss="17613" vmhwm="1856210" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="create_exenetwork" device="GPU" vmsize="710569" vmpeak="3363879" vmrss="365380" vmhwm="3081751" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="CPU" vmsize="2487130" vmpeak="3004311" vmrss="1687936" vmhwm="1856448" />
+        <model path="caffe/FP32/dilation/dilation.xml" test="infer_request_inference" device="GPU" vmsize="2951748" vmpeak="3363804" vmrss="2597940" vmhwm="3080968" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767157" vmpeak="1369376" vmrss="63338" vmhwm="540166" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1155101" vmpeak="1701180" vmrss="809938" vmhwm="1420152" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1299262" vmpeak="1373882" vmrss="431758" vmhwm="540214" />
+        <model path="caffe/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1647738" vmpeak="1719828" vmrss="1296350" vmhwm="1419092" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1642832" vmrss="14014" vmhwm="789109" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="595430" vmpeak="1690484" vmrss="250496" vmhwm="1409205" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1494464" vmpeak="1642832" vmrss="679214" vmhwm="789412" />
+        <model path="caffe/FP32/fcn_alexnet/fcn_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1450746" vmpeak="1693172" vmrss="1097681" vmhwm="1412254" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="919740" vmpeak="1521955" vmrss="234520" vmhwm="792022" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1666363" vmpeak="2175012" vmrss="1321245" vmhwm="1893936" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1436982" vmpeak="1521955" vmrss="643614" vmhwm="793218" />
+        <model path="caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2138818" vmpeak="2210907" vmrss="1786162" vmhwm="1893760" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757262" vmpeak="978832" vmrss="81408" vmhwm="124238" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="810590" vmpeak="929139" vmrss="464868" vmhwm="503813" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="928637" vmpeak="1000727" vmrss="130719" vmhwm="130719" />
+        <model path="caffe/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="859478" vmpeak="931568" vmrss="507540" vmhwm="507540" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="766726" vmpeak="925245" vmrss="33382" vmhwm="180268" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="775117" vmpeak="913347" vmrss="430157" vmhwm="605598" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927163" vmpeak="999253" vmrss="141869" vmhwm="181156" />
+        <model path="caffe/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="924752" vmpeak="996842" vmrss="571590" vmhwm="602839" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="767003" vmpeak="1090526" vmrss="34900" vmhwm="348172" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="948046" vmpeak="1182082" vmrss="602624" vmhwm="900169" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1051481" vmpeak="1123570" vmrss="257219" vmhwm="348541" />
+        <model path="caffe/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1187106" vmpeak="1259196" vmrss="834438" vmhwm="902800" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764315" vmpeak="1326938" vmrss="63725" vmhwm="603213" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1183410" vmpeak="1680448" vmrss="837953" vmhwm="1398870" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1227798" vmpeak="1326908" vmrss="438160" vmhwm="602434" />
+        <model path="caffe/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1633997" vmpeak="1706086" vmrss="1281693" vmhwm="1395878" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="CPU" vmsize="753605" vmpeak="876330" vmrss="15571" vmhwm="29106" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="create_exenetwork" device="GPU" vmsize="566693" vmpeak="658486" vmrss="220783" vmhwm="232452" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="CPU" vmsize="808486" vmpeak="880576" vmrss="29084" vmhwm="29084" />
+        <model path="caffe/FP32/lenet/lenet.xml" test="infer_request_inference" device="GPU" vmsize="586401" vmpeak="658490" vmrss="232764" vmhwm="232764" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754864" vmpeak="893692" vmrss="54617" vmhwm="81584" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="642527" vmpeak="750424" vmrss="296678" vmhwm="362300" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831336" vmpeak="903425" vmrss="85654" vmhwm="85654" />
+        <model path="caffe/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="716047" vmpeak="788136" vmrss="364434" vmhwm="364434" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756813" vmpeak="819698" vmrss="54410" vmhwm="78289" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758705" vmpeak="862466" vmrss="412966" vmhwm="437131" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="840967" vmpeak="840967" vmrss="82860" vmhwm="82860" />
+        <model path="caffe/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="787182" vmpeak="859271" vmrss="436801" vmhwm="436801" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="753715" vmpeak="876299" vmrss="17512" vmhwm="28402" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="583092" vmpeak="674744" vmrss="238220" vmhwm="249722" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="27865" vmhwm="27865" />
+        <model path="caffe/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600714" vmpeak="672804" vmrss="246967" vmhwm="246967" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="763677" vmpeak="874535" vmrss="13318" vmhwm="35327" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570521" vmpeak="662182" vmrss="224774" vmhwm="351410" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="973350" vmrss="108037" vmhwm="108037" />
+        <model path="caffe/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="685115" vmpeak="757204" vmrss="331421" vmhwm="351529" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14806" vmhwm="25911" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577280" vmpeak="667673" vmrss="232029" vmhwm="242580" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="25352" vmhwm="25352" />
+        <model path="caffe/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="593340" vmpeak="665429" vmrss="240200" vmhwm="240200" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="CPU" vmsize="764711" vmpeak="1279238" vmrss="23544" vmhwm="528431" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="create_exenetwork" device="GPU" vmsize="890428" vmpeak="1316884" vmrss="544882" vmhwm="1035192" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="CPU" vmsize="1187529" vmpeak="1279207" vmrss="398512" vmhwm="528730" />
+        <model path="caffe/FP32/openpose_face/openpose_face.xml" test="infer_request_inference" device="GPU" vmsize="1288707" vmpeak="1360796" vmrss="935778" vmhwm="1038888" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="CPU" vmsize="755634" vmpeak="1259024" vmrss="23342" vmhwm="507980" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="create_exenetwork" device="GPU" vmsize="845886" vmpeak="1297898" vmrss="500957" vmhwm="1016822" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="CPU" vmsize="1327246" vmpeak="1327246" vmrss="384634" vmhwm="507522" />
+        <model path="caffe/FP32/openpose_hand/openpose_hand.xml" test="infer_request_inference" device="GPU" vmsize="1277117" vmpeak="1300490" vmrss="923674" vmhwm="1018956" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="CPU" vmsize="757556" vmpeak="1471373" vmrss="32780" vmhwm="716861" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="create_exenetwork" device="GPU" vmsize="1153103" vmpeak="1684306" vmrss="807426" vmhwm="1402513" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="CPU" vmsize="1397686" vmpeak="1471373" vmrss="528620" vmhwm="717728" />
+        <model path="caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml" test="infer_request_inference" device="GPU" vmsize="1597785" vmpeak="1680465" vmrss="1244672" vmhwm="1399217" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="1485853" vmrss="14330" vmhwm="773766" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="create_exenetwork" device="GPU" vmsize="604573" vmpeak="1684861" vmrss="259556" vmhwm="1403600" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="CPU" vmsize="1311107" vmpeak="1485862" vmrss="528448" vmhwm="773656" />
+        <model path="caffe/FP32/places205_alexnet/places205_alexnet.xml" test="infer_request_inference" device="GPU" vmsize="1346840" vmpeak="1684896" vmrss="993942" vmhwm="1403886" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757187" vmpeak="831362" vmrss="78795" vmhwm="113814" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="805270" vmpeak="920321" vmrss="460319" vmhwm="495638" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="852781" vmpeak="852781" vmrss="119033" vmhwm="119033" />
+        <model path="caffe/FP32/places205_googlenet/places205_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="847052" vmpeak="919142" vmrss="494916" vmhwm="494916" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="CPU" vmsize="754248" vmpeak="925443" vmrss="16878" vmhwm="177663" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="create_exenetwork" device="GPU" vmsize="657659" vmpeak="799510" vmrss="312070" vmhwm="466153" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="CPU" vmsize="920163" vmpeak="920163" vmrss="131859" vmhwm="176726" />
+        <model path="caffe/FP32/resnet_18/resnet_18.xml" test="infer_request_inference" device="GPU" vmsize="775350" vmpeak="847440" vmrss="422919" vmhwm="467610" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760584" vmpeak="1338202" vmrss="43243" vmhwm="616928" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1104862" vmpeak="1557006" vmrss="759030" vmhwm="1275071" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1224172" vmpeak="1338172" vmrss="434944" vmhwm="616849" />
+        <model path="caffe/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1452145" vmpeak="1558106" vmrss="1099428" vmhwm="1276787" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="764878" vmpeak="1551919" vmrss="58638" vmhwm="828383" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1315120" vmpeak="1977250" vmrss="968858" vmhwm="1694796" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1526166" vmpeak="1598256" vmrss="582401" vmhwm="829598" />
+        <model path="caffe/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1804748" vmpeak="1975855" vmrss="1451397" vmhwm="1693419" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="CPU" vmsize="927665" vmpeak="2236845" vmrss="224034" vmhwm="1396458" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="create_exenetwork" device="GPU" vmsize="1988676" vmpeak="3156291" vmrss="1643919" vmhwm="2874946" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="CPU" vmsize="2016999" vmpeak="2236955" vmrss="1117754" vmhwm="1396128" />
+        <model path="caffe/FP32/resnet_v1_269/resnet_v1_269.xml" test="infer_request_inference" device="GPU" vmsize="2845849" vmpeak="3165219" vmrss="2493550" vmhwm="2883091" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766101" vmpeak="1079971" vmrss="27359" vmhwm="362142" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="834856" vmpeak="1080094" vmrss="490089" vmhwm="799312" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046381" vmpeak="1118471" vmrss="260528" vmhwm="362203" />
+        <model path="caffe/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1060109" vmpeak="1132199" vmrss="707876" vmhwm="804108" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="CPU" vmsize="758516" vmpeak="930397" vmrss="40572" vmhwm="194062" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="create_exenetwork" device="GPU" vmsize="873061" vmpeak="1013430" vmrss="528167" vmhwm="692564" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="CPU" vmsize="957620" vmpeak="1029710" vmrss="152754" vmhwm="194656" />
+        <model path="caffe/FP32/se_bn_inception/se_bn_inception.xml" test="infer_request_inference" device="GPU" vmsize="1014305" vmpeak="1086395" vmrss="662525" vmhwm="694821" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="CPU" vmsize="759382" vmpeak="1174707" vmrss="39265" vmhwm="401856" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="create_exenetwork" device="GPU" vmsize="983083" vmpeak="1257471" vmrss="637335" vmhwm="975444" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="CPU" vmsize="1140730" vmpeak="1174672" vmrss="315977" vmhwm="401508" />
+        <model path="caffe/FP32/se_resnext_50/se_resnext_50.xml" test="infer_request_inference" device="GPU" vmsize="1251214" vmpeak="1323304" vmrss="899034" vmhwm="976474" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="754890" vmpeak="815095" vmrss="28833" vmhwm="43881" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="651974" vmpeak="746719" vmrss="306455" vmhwm="321345" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="824942" vmpeak="897032" vmrss="48567" vmhwm="48567" />
+        <model path="caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="676328" vmpeak="748418" vmrss="324860" vmhwm="324860" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="758212" vmpeak="813208" vmrss="29691" vmhwm="44220" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="611789" vmpeak="706534" vmrss="266244" vmhwm="324007" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818549" vmpeak="890639" vmrss="47141" vmhwm="47141" />
+        <model path="caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="677705" vmpeak="749795" vmrss="326163" vmhwm="326163" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="CPU" vmsize="757534" vmpeak="911495" vmrss="36445" vmhwm="182050" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="create_exenetwork" device="GPU" vmsize="835683" vmpeak="973280" vmrss="490613" vmhwm="658640" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="CPU" vmsize="941076" vmpeak="1013166" vmrss="148222" vmhwm="183185" />
+        <model path="caffe/FP32/ssd_googlenet/ssd_googlenet.xml" test="infer_request_inference" device="GPU" vmsize="989608" vmpeak="1061698" vmrss="637709" vmhwm="661746" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="757174" vmpeak="901648" vmrss="73409" vmhwm="106537" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="801644" vmpeak="915186" vmrss="456517" vmhwm="490520" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="847932" vmpeak="847932" vmrss="116410" vmhwm="116410" />
+        <model path="caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="843022" vmpeak="915112" vmrss="490864" vmhwm="490864" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="CPU" vmsize="765393" vmpeak="900402" vmrss="71544" vmhwm="105032" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="create_exenetwork" device="GPU" vmsize="759668" vmpeak="872762" vmrss="414493" vmhwm="497701" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="CPU" vmsize="848438" vmpeak="900754" vmrss="113590" vmhwm="113590" />
+        <model path="caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml" test="infer_request_inference" device="GPU" vmsize="847620" vmpeak="919710" vmrss="495730" vmhwm="495730" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755374" vmpeak="1146156" vmrss="22026" vmhwm="370176" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="768451" vmpeak="1074730" vmrss="423662" vmhwm="794266" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113609" vmpeak="1185698" vmrss="313513" vmhwm="370035" />
+        <model path="caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1134227" vmpeak="1206317" vmrss="783006" vmhwm="795000" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="CPU" vmsize="755796" vmpeak="1267802" vmrss="23746" vmhwm="383983" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="create_exenetwork" device="GPU" vmsize="794565" vmpeak="1272634" vmrss="449394" vmhwm="991632" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="CPU" vmsize="1234050" vmpeak="1306140" vmrss="421194" vmhwm="421194" />
+        <model path="caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml" test="infer_request_inference" device="GPU" vmsize="1348960" vmpeak="1421050" vmrss="999050" vmhwm="999050" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754006" vmpeak="2548497" vmrss="15598" vmhwm="1808624" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668602" vmpeak="3326708" vmrss="323791" vmhwm="3045328" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027181" vmpeak="2548497" vmrss="1242560" vmhwm="1808730" />
+        <model path="caffe/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2441076" vmpeak="3326708" vmrss="2088055" vmhwm="3045050" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="2618030" vmrss="15510" vmhwm="1877383" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="739222" vmpeak="3397112" vmrss="393866" vmhwm="3115085" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2073794" vmpeak="2618030" vmrss="1289741" vmhwm="1878289" />
+        <model path="caffe/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518340" vmpeak="3397081" vmrss="2165196" vmhwm="3114975" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="CPU" vmsize="764940" vmpeak="947157" vmrss="27988" vmhwm="223726" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="create_exenetwork" device="GPU" vmsize="789223" vmpeak="941683" vmrss="443788" vmhwm="641476" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="CPU" vmsize="962187" vmpeak="1034277" vmrss="177848" vmhwm="224180" />
+        <model path="caffe/FP32/vnect/vnect.xml" test="infer_request_inference" device="GPU" vmsize="969069" vmpeak="1041158" vmrss="616990" vmhwm="641977" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="CPU" vmsize="755651" vmpeak="1654985" vmrss="24921" vmhwm="920400" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="create_exenetwork" device="GPU" vmsize="936892" vmpeak="1838610" vmrss="590994" vmhwm="1556526" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="CPU" vmsize="1433352" vmpeak="1654989" vmrss="639456" vmhwm="918693" />
+        <model path="caffe/FP32/wrn_50_2/wrn_50_2.xml" test="infer_request_inference" device="GPU" vmsize="1613176" vmpeak="1824922" vmrss="1259940" vmhwm="1543031" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754692" vmpeak="4259393" vmrss="18013" vmhwm="3532412" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="719105" vmpeak="5906194" vmrss="373648" vmhwm="5623600" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3167040" vmpeak="4259380" vmrss="2378362" vmhwm="3531237" />
+        <model path="caffe/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4165801" vmpeak="5903801" vmrss="3812393" vmhwm="5621585" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753860" vmpeak="1101161" vmrss="14599" vmhwm="375399" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="577640" vmpeak="1037480" vmrss="232443" vmhwm="755972" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1059828" vmpeak="1131917" vmrss="272879" vmhwm="374721" />
+        <model path="caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="957453" vmpeak="1037445" vmrss="605026" vmhwm="756606" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="16790" vmhwm="680072" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="678964" vmpeak="1435790" vmrss="334017" vmhwm="1154573" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1279823" vmpeak="1422647" vmrss="490692" vmhwm="680526" />
+        <model path="caffe/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1325156" vmpeak="1438571" vmrss="972140" vmhwm="1157138" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753733" vmpeak="954430" vmrss="14278" vmhwm="229913" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="create_exenetwork" device="GPU" vmsize="568880" vmpeak="814976" vmrss="223907" vmhwm="533808" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1032882" vmpeak="1032882" vmrss="174631" vmhwm="230243" />
+        <model path="caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml" test="infer_request_inference" device="GPU" vmsize="810031" vmpeak="816178" vmrss="456856" vmhwm="534503" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756852" vmpeak="1587154" vmrss="31460" vmhwm="837570" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1159840" vmpeak="1822444" vmrss="813969" vmhwm="1540343" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1554462" vmpeak="1626552" vmrss="609677" vmhwm="836655" />
+        <model path="caffe/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1735610" vmpeak="1821749" vmrss="1383285" vmhwm="1540598" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="CPU" vmsize="753856" vmpeak="1528538" vmrss="14414" vmhwm="815491" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="create_exenetwork" device="GPU" vmsize="580030" vmpeak="1741062" vmrss="235624" vmhwm="1460386" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="CPU" vmsize="1339681" vmpeak="1528538" vmrss="556146" vmhwm="815262" />
+        <model path="mxnet/FP32/caffenet/caffenet.xml" test="infer_request_inference" device="GPU" vmsize="1389097" vmpeak="1741093" vmrss="1036178" vmhwm="1460060" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772622" vmpeak="985749" vmrss="95431" vmhwm="151087" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1141962" vmpeak="1252068" vmrss="796734" vmhwm="827217" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985239" vmpeak="1057328" vmrss="158532" vmhwm="158532" />
+        <model path="mxnet/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171425" vmpeak="1243514" vmrss="818624" vmhwm="818624" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="1211720" vmrss="93486" vmhwm="426896" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="create_exenetwork" device="GPU" vmsize="1312801" vmpeak="1592839" vmrss="967252" vmhwm="1311569" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="CPU" vmsize="1198124" vmpeak="1270214" vmrss="353051" vmhwm="427319" />
+        <model path="mxnet/FP32/densenet_161/densenet_161.xml" test="infer_request_inference" device="GPU" vmsize="1657339" vmpeak="1729428" vmrss="1304820" vmhwm="1304820" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="796360" vmpeak="1002408" vmrss="123094" vmhwm="239945" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1352916" vmpeak="1472262" vmrss="1007630" vmhwm="1084727" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1059880" vmpeak="1059880" vmrss="239307" vmhwm="241753" />
+        <model path="mxnet/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1437656" vmpeak="1509745" vmrss="1084828" vmhwm="1084828" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="CPU" vmsize="864635" vmpeak="1154040" vmrss="148830" vmhwm="322528" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="create_exenetwork" device="GPU" vmsize="1505042" vmpeak="1650162" vmrss="1159906" vmhwm="1343711" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="CPU" vmsize="1181056" vmpeak="1253146" vmrss="315048" vmhwm="322282" />
+        <model path="mxnet/FP32/densenet_201/densenet_201.xml" test="infer_request_inference" device="GPU" vmsize="1719256" vmpeak="1791345" vmrss="1366767" vmhwm="1366767" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="CPU" vmsize="767976" vmpeak="1370195" vmrss="63456" vmhwm="539897" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="create_exenetwork" device="GPU" vmsize="1313452" vmpeak="1701664" vmrss="968145" vmhwm="1420434" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="CPU" vmsize="1295571" vmpeak="1370195" vmrss="430610" vmhwm="539536" />
+        <model path="mxnet/FP32/dpn_92/dpn_92.xml" test="infer_request_inference" device="GPU" vmsize="1651421" vmpeak="1723510" vmrss="1299738" vmhwm="1422326" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754212" vmpeak="3124338" vmrss="17362" vmhwm="1770388" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="create_exenetwork" device="GPU" vmsize="669583" vmpeak="3628222" vmrss="324363" vmhwm="3347071" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2705824" vmpeak="3124338" vmrss="1906933" vmhwm="1906933" />
+        <model path="mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml" test="infer_request_inference" device="GPU" vmsize="3710449" vmpeak="3782539" vmrss="3356861" vmhwm="3356861" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="1192276" vmrss="32300" vmhwm="470417" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="create_exenetwork" device="GPU" vmsize="772970" vmpeak="1363872" vmrss="428054" vmhwm="1079412" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="CPU" vmsize="1123746" vmpeak="1195836" vmrss="335288" vmhwm="470162" />
+        <model path="mxnet/FP32/full_imagenet_network/full_imagenet_network.xml" test="infer_request_inference" device="GPU" vmsize="1219618" vmpeak="1362376" vmrss="875415" vmhwm="1077560" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848157" vmpeak="1522730" vmrss="178424" vmhwm="792470" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1549574" vmpeak="2182501" vmrss="1203804" vmhwm="1900742" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437730" vmpeak="1522730" vmrss="644402" vmhwm="794024" />
+        <model path="mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2145426" vmpeak="2217516" vmrss="1793162" vmhwm="1899854" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="756584" vmpeak="925636" vmrss="32982" vmhwm="182529" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="769230" vmpeak="907847" vmrss="423874" vmhwm="604982" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="928659" vmpeak="928659" vmrss="142304" vmhwm="182353" />
+        <model path="mxnet/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="926103" vmpeak="998192" vmrss="572985" vmhwm="603592" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="CPU" vmsize="757851" vmpeak="1078682" vmrss="34751" vmhwm="348154" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="create_exenetwork" device="GPU" vmsize="911473" vmpeak="1183102" vmrss="565549" vmhwm="900992" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="CPU" vmsize="1051652" vmpeak="1123742" vmrss="258231" vmhwm="349131" />
+        <model path="mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml" test="infer_request_inference" device="GPU" vmsize="1182570" vmpeak="1254660" vmrss="829659" vmhwm="899540" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764319" vmpeak="1327506" vmrss="61375" vmhwm="601048" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1206559" vmpeak="1676272" vmrss="860362" vmhwm="1393906" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1228396" vmpeak="1327475" vmrss="441135" vmhwm="603394" />
+        <model path="mxnet/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1637486" vmpeak="1709576" vmrss="1285376" vmhwm="1398377" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="CPU" vmsize="761046" vmpeak="1754029" vmrss="43916" vmhwm="1002368" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="create_exenetwork" device="GPU" vmsize="1026110" vmpeak="2108686" vmrss="680191" vmhwm="1826792" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="CPU" vmsize="1512095" vmpeak="1753998" vmrss="701483" vmhwm="1002333" />
+        <model path="mxnet/FP32/location_net/location_net.xml" test="infer_request_inference" device="GPU" vmsize="1880973" vmpeak="2110306" vmrss="1532348" vmhwm="1828952" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="CPU" vmsize="759695" vmpeak="1636430" vmrss="38011" vmhwm="883225" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="create_exenetwork" device="GPU" vmsize="1118880" vmpeak="1994964" vmrss="773102" vmhwm="1713034" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="CPU" vmsize="1430871" vmpeak="1636434" vmrss="617078" vmhwm="882886" />
+        <model path="mxnet/FP32/lresnet100e/lresnet100e.xml" test="infer_request_inference" device="GPU" vmsize="1804484" vmpeak="1993530" vmrss="1450724" vmhwm="1711340" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="821893" vmrss="55070" vmhwm="82354" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="create_exenetwork" device="GPU" vmsize="626304" vmpeak="734201" vmrss="280918" vmhwm="362925" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="CPU" vmsize="831344" vmpeak="903434" vmrss="86495" vmhwm="86495" />
+        <model path="mxnet/FP32/mobilenet/mobilenet.xml" test="infer_request_inference" device="GPU" vmsize="718357" vmpeak="790446" vmrss="367096" vmhwm="367096" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="CPU" vmsize="756826" vmpeak="819711" vmrss="53961" vmhwm="77206" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="create_exenetwork" device="GPU" vmsize="758023" vmpeak="861784" vmrss="412702" vmhwm="436805" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="CPU" vmsize="836470" vmpeak="891765" vmrss="83050" vmhwm="83050" />
+        <model path="mxnet/FP32/mobilenet_v2/mobilenet_v2.xml" test="infer_request_inference" device="GPU" vmsize="788986" vmpeak="861075" vmrss="437646" vmhwm="437646" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="CPU" vmsize="762731" vmpeak="804491" vmrss="17490" vmhwm="28454" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="create_exenetwork" device="GPU" vmsize="578894" vmpeak="670546" vmrss="233547" vmhwm="245172" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="CPU" vmsize="808209" vmpeak="808209" vmrss="28314" vmhwm="28314" />
+        <model path="mxnet/FP32/mtcnn_o/mtcnn_o.xml" test="infer_request_inference" device="GPU" vmsize="600507" vmpeak="672597" vmrss="247596" vmhwm="247596" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="CPU" vmsize="753530" vmpeak="881588" vmrss="13208" vmhwm="35261" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="create_exenetwork" device="GPU" vmsize="570042" vmpeak="661702" vmrss="224870" vmhwm="353003" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="CPU" vmsize="901260" vmpeak="901260" vmrss="107390" vmhwm="107390" />
+        <model path="mxnet/FP32/mtcnn_p/mtcnn_p.xml" test="infer_request_inference" device="GPU" vmsize="686408" vmpeak="758498" vmrss="332895" vmhwm="351907" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="CPU" vmsize="753711" vmpeak="803228" vmrss="14546" vmhwm="25586" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="create_exenetwork" device="GPU" vmsize="577288" vmpeak="667682" vmrss="231642" vmhwm="242167" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="CPU" vmsize="806102" vmpeak="806102" vmrss="24468" vmhwm="24468" />
+        <model path="mxnet/FP32/mtcnn_r/mtcnn_r.xml" test="infer_request_inference" device="GPU" vmsize="595588" vmpeak="667678" vmrss="242246" vmhwm="242246" />
+        <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="CPU" vmsize="753838" vmpeak="907420" vmrss="80674" vmhwm="122086" />
+        <model path="mxnet/FP32/nin/nin.xml" test="create_exenetwork" device="GPU" vmsize="675633" vmpeak="798283" vmrss="330184" vmhwm="372754" />
+        <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="CPU" vmsize="841390" vmpeak="913479" vmrss="123776" vmhwm="123776" />
+        <model path="mxnet/FP32/nin/nin.xml" test="infer_request_inference" device="GPU" vmsize="726066" vmpeak="798155" vmrss="390764" vmhwm="390764" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754080" vmpeak="884950" vmrss="35930" vmhwm="56368" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="create_exenetwork" device="GPU" vmsize="613082" vmpeak="713020" vmrss="267753" vmhwm="358019" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="CPU" vmsize="847726" vmpeak="919815" vmrss="83300" vmhwm="83300" />
+        <model path="mxnet/FP32/nst_vgg19/nst_vgg19.xml" test="infer_request_inference" device="GPU" vmsize="710754" vmpeak="782843" vmrss="357442" vmhwm="357442" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760821" vmpeak="1370292" vmrss="44242" vmhwm="618965" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1077643" vmpeak="1594964" vmrss="731733" vmhwm="1313127" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1256200" vmpeak="1370261" vmrss="444043" vmhwm="617852" />
+        <model path="mxnet/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1494732" vmpeak="1596218" vmrss="1141690" vmhwm="1314187" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765322" vmpeak="1593790" vmrss="61120" vmhwm="831661" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1339184" vmpeak="2040148" vmrss="993968" vmhwm="1758746" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1414652" vmpeak="1593754" vmrss="594426" vmhwm="832220" />
+        <model path="mxnet/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1871271" vmpeak="2037904" vmrss="1518501" vmhwm="1756343" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="760650" vmpeak="1369557" vmrss="43384" vmhwm="618015" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1022863" vmpeak="1592206" vmrss="676698" vmhwm="1309880" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1255557" vmpeak="1369522" vmrss="445350" vmhwm="618750" />
+        <model path="mxnet/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1490077" vmpeak="1591563" vmrss="1137444" vmhwm="1309910" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="765204" vmpeak="1593108" vmrss="61124" vmhwm="831353" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1340754" vmpeak="2034586" vmrss="995636" vmhwm="1753100" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1413992" vmpeak="1593077" vmrss="592710" vmhwm="831098" />
+        <model path="mxnet/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1867096" vmpeak="2036610" vmrss="1514532" vmhwm="1755089" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="CPU" vmsize="766911" vmpeak="1356080" vmrss="64389" vmhwm="623026" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="create_exenetwork" device="GPU" vmsize="1105068" vmpeak="1552320" vmrss="759990" vmhwm="1271340" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="CPU" vmsize="1258699" vmpeak="1356084" vmrss="468780" vmhwm="623788" />
+        <model path="mxnet/FP32/resnext_101/resnext_101.xml" test="infer_request_inference" device="GPU" vmsize="1478730" vmpeak="1553591" vmrss="1126364" vmhwm="1272167" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="CPU" vmsize="761239" vmpeak="1894468" vmrss="40691" vmhwm="1139410" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="create_exenetwork" device="GPU" vmsize="1418938" vmpeak="2248351" vmrss="1073886" vmhwm="1967262" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="CPU" vmsize="1618592" vmpeak="1894499" vmrss="810946" vmhwm="1140422" />
+        <model path="mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml" test="infer_request_inference" device="GPU" vmsize="1996112" vmpeak="2247322" vmrss="1660700" vmhwm="1965405" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="754987" vmpeak="880664" vmrss="29475" vmhwm="43832" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="616360" vmpeak="711106" vmrss="270859" vmhwm="322498" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818562" vmpeak="818562" vmrss="47141" vmhwm="47141" />
+        <model path="mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="674124" vmpeak="746213" vmrss="322731" vmhwm="322731" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146433" vmrss="21806" vmhwm="370044" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="create_exenetwork" device="GPU" vmsize="775324" vmpeak="1077709" vmrss="430342" vmhwm="796857" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="CPU" vmsize="1113904" vmpeak="1185993" vmrss="312527" vmhwm="370946" />
+        <model path="mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml" test="infer_request_inference" device="GPU" vmsize="1137391" vmpeak="1137391" vmrss="785391" vmhwm="793201" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754133" vmpeak="2548906" vmrss="14955" vmhwm="1807044" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="668619" vmpeak="3326725" vmrss="322691" vmhwm="3044404" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2027476" vmpeak="2548906" vmrss="1242678" vmhwm="1808470" />
+        <model path="mxnet/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438563" vmpeak="3326725" vmrss="2085028" vmhwm="3044505" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754226" vmpeak="2618325" vmrss="15708" vmhwm="1877977" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="741092" vmpeak="3397116" vmrss="396074" vmhwm="3115345" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2074089" vmpeak="2618325" vmrss="1290049" vmhwm="1878672" />
+        <model path="mxnet/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2518436" vmpeak="3397178" vmrss="2165728" vmhwm="3115459" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="CPU" vmsize="754701" vmpeak="4259684" vmrss="17626" vmhwm="3531853" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="create_exenetwork" device="GPU" vmsize="747582" vmpeak="5921322" vmrss="402490" vmhwm="5639084" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="CPU" vmsize="3095241" vmpeak="4259670" vmrss="2379062" vmhwm="3530652" />
+        <model path="mxnet/FP32/yolo_v1_full/yolo_v1_full.xml" test="infer_request_inference" device="GPU" vmsize="4163667" vmpeak="5923566" vmrss="3810193" vmhwm="5640967" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="CPU" vmsize="754023" vmpeak="1334414" vmrss="15254" vmhwm="608322" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="create_exenetwork" device="GPU" vmsize="600701" vmpeak="1330978" vmrss="255912" vmhwm="1049844" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="CPU" vmsize="1215838" vmpeak="1334383" vmrss="428331" vmhwm="607442" />
+        <model path="mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml" test="infer_request_inference" device="GPU" vmsize="1199972" vmpeak="1330384" vmrss="847391" vmhwm="1049228" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="CPU" vmsize="755387" vmpeak="1175570" vmrss="25374" vmhwm="306904" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="create_exenetwork" device="GPU" vmsize="805222" vmpeak="1346307" vmrss="460781" vmhwm="1065873" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="CPU" vmsize="1188580" vmpeak="1260670" vmrss="336036" vmhwm="336036" />
+        <model path="onnx/FP32/ssd_resnet34/ssd_resnet34.xml" test="infer_request_inference" device="GPU" vmsize="1449408" vmpeak="1521498" vmrss="1096792" vmhwm="1096792" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="CPU" vmsize="756822" vmpeak="1181615" vmrss="28468" vmhwm="309716" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="create_exenetwork" device="GPU" vmsize="819271" vmpeak="2432738" vmrss="474764" vmhwm="1101047" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="CPU" vmsize="1189117" vmpeak="1261207" vmrss="333788" vmhwm="333788" />
+        <model path="onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml" test="infer_request_inference" device="GPU" vmsize="2539222" vmpeak="2611312" vmrss="2191604" vmhwm="2191604" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="757878" vmpeak="1077934" vmrss="35261" vmhwm="348964" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="899610" vmpeak="1179116" vmrss="553863" vmhwm="896997" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1050878" vmpeak="1077876" vmrss="256506" vmhwm="347974" />
+        <model path="pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1179239" vmpeak="1251329" vmrss="826553" vmhwm="897714" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="760456" vmpeak="1096708" vmrss="27315" vmhwm="361944" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="834275" vmpeak="1073569" vmrss="489086" vmhwm="792343" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="1058622" vmpeak="1130712" vmrss="267682" vmhwm="362749" />
+        <model path="pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="1050852" vmpeak="1122941" vmrss="697576" vmhwm="791040" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="CPU" vmsize="755950" vmpeak="1092203" vmrss="27640" vmhwm="362740" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="create_exenetwork" device="GPU" vmsize="835951" vmpeak="1073516" vmrss="490674" vmhwm="792224" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="CPU" vmsize="1058626" vmpeak="1130716" vmrss="266516" vmhwm="361992" />
+        <model path="pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml" test="infer_request_inference" device="GPU" vmsize="1050218" vmpeak="1071435" vmrss="696669" vmhwm="789848" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="CPU" vmsize="754872" vmpeak="880550" vmrss="29603" vmhwm="43212" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="create_exenetwork" device="GPU" vmsize="648881" vmpeak="743626" vmrss="303424" vmhwm="318348" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="CPU" vmsize="818246" vmpeak="818246" vmrss="46534" vmhwm="46534" />
+        <model path="pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml" test="infer_request_inference" device="GPU" vmsize="674146" vmpeak="746235" vmrss="320315" vmhwm="320315" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="CPU" vmsize="764755" vmpeak="2092574" vmrss="38016" vmhwm="1352450" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="create_exenetwork" device="GPU" vmsize="1578328" vmpeak="3355976" vmrss="1233474" vmhwm="3074953" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="CPU" vmsize="1802838" vmpeak="2092587" vmrss="994188" vmhwm="1352709" />
+        <model path="tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml" test="infer_request_inference" device="GPU" vmsize="2958472" vmpeak="3352694" vmrss="2607677" vmhwm="3072185" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="CPU" vmsize="765124" vmpeak="2035453" vmrss="39745" vmhwm="1292420" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="create_exenetwork" device="GPU" vmsize="1939801" vmpeak="3261715" vmrss="1594617" vmhwm="2980577" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="CPU" vmsize="1750196" vmpeak="2039945" vmrss="935774" vmhwm="1291963" />
+        <model path="tf/1.14.0/FP32/bert_xnli/bert_xnli.xml" test="infer_request_inference" device="GPU" vmsize="2902235" vmpeak="3265460" vmrss="2551727" vmhwm="2984352" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="CPU" vmsize="757587" vmpeak="1547678" vmrss="33004" vmhwm="718973" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="create_exenetwork" device="GPU" vmsize="1154670" vmpeak="1678943" vmrss="809811" vmhwm="1398284" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="CPU" vmsize="1553134" vmpeak="1553134" vmrss="606232" vmhwm="719791" />
+        <model path="tf/1.14.0/FP32/cmu/cmu.xml" test="infer_request_inference" device="GPU" vmsize="1753910" vmpeak="1826000" vmrss="1400234" vmhwm="1400234" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="CPU" vmsize="757160" vmpeak="867486" vmrss="41307" vmhwm="62678" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="create_exenetwork" device="GPU" vmsize="743283" vmpeak="841055" vmrss="398604" vmhwm="537209" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="CPU" vmsize="888087" vmpeak="960176" vmrss="114166" vmhwm="114166" />
+        <model path="tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml" test="infer_request_inference" device="GPU" vmsize="894339" vmpeak="966429" vmrss="541912" vmhwm="541912" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="CPU" vmsize="772728" vmpeak="951218" vmrss="95840" vmhwm="151676" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="create_exenetwork" device="GPU" vmsize="1135195" vmpeak="1245301" vmrss="789848" vmhwm="820410" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="CPU" vmsize="985450" vmpeak="1057540" vmrss="159046" vmhwm="159046" />
+        <model path="tf/1.14.0/FP32/densenet_121/densenet_121.xml" test="infer_request_inference" device="GPU" vmsize="1171152" vmpeak="1243242" vmrss="818598" vmhwm="818598" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="CPU" vmsize="864168" vmpeak="998263" vmrss="126266" vmhwm="241604" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="create_exenetwork" device="GPU" vmsize="1353237" vmpeak="1472583" vmrss="1007978" vmhwm="1094614" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="CPU" vmsize="1060316" vmpeak="1132406" vmrss="238326" vmhwm="240724" />
+        <model path="tf/1.14.0/FP32/densenet_169/densenet_169.xml" test="infer_request_inference" device="GPU" vmsize="1447146" vmpeak="1519236" vmrss="1094759" vmhwm="1097835" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="CPU" vmsize="757156" vmpeak="826843" vmrss="69031" vmhwm="100887" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="create_exenetwork" device="GPU" vmsize="796250" vmpeak="906813" vmrss="451171" vmhwm="482077" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="CPU" vmsize="849041" vmpeak="849041" vmrss="104464" vmhwm="104464" />
+        <model path="tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml" test="infer_request_inference" device="GPU" vmsize="833984" vmpeak="906074" vmrss="481786" vmhwm="481786" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="CPU" vmsize="760786" vmpeak="1139173" vmrss="66413" vmhwm="353346" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="create_exenetwork" device="GPU" vmsize="1055560" vmpeak="1255601" vmrss="710595" vmhwm="974815" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="CPU" vmsize="1097984" vmpeak="1170074" vmrss="281050" vmhwm="352228" />
+        <model path="tf/1.14.0/FP32/facenet/facenet.xml" test="infer_request_inference" device="GPU" vmsize="1259253" vmpeak="1331343" vmrss="906562" vmhwm="976483" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="920884" vmpeak="2443892" vmrss="237186" vmhwm="851215" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1751376" vmpeak="4164239" vmrss="1406411" vmhwm="3883422" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="757323" vmpeak="986519" vmrss="35006" vmhwm="212911" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="862219" vmpeak="1179283" vmrss="516881" vmhwm="897930" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="761538" vmpeak="1491811" vmrss="45667" vmhwm="671554" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1126884" vmpeak="1800550" vmrss="781739" vmhwm="1519302" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="CPU" vmsize="766964" vmpeak="1233342" vmrss="29568" vmhwm="415509" />
+        <model path="tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml" test="create_exenetwork" device="GPU" vmsize="897432" vmpeak="1347007" vmrss="553357" vmhwm="1067290" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="CPU" vmsize="756562" vmpeak="1099533" vmrss="30078" vmhwm="245590" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="create_exenetwork" device="GPU" vmsize="764170" vmpeak="1353149" vmrss="419267" vmhwm="1072244" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="CPU" vmsize="1478496" vmpeak="1478496" vmrss="332820" vmhwm="332820" />
+        <model path="tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml" test="infer_request_inference" device="GPU" vmsize="1423364" vmpeak="1495454" vmrss="1070973" vmhwm="1172441" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="CPU" vmsize="755092" vmpeak="815298" vmrss="28811" vmhwm="43687" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="create_exenetwork" device="GPU" vmsize="620734" vmpeak="715479" vmrss="274991" vmhwm="324935" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="CPU" vmsize="825268" vmpeak="825268" vmrss="48439" vmhwm="48439" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml" test="infer_request_inference" device="GPU" vmsize="680592" vmpeak="752681" vmrss="326972" vmhwm="326972" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="765182" vmpeak="880712" vmrss="29827" vmhwm="44149" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="612620" vmpeak="707366" vmrss="266855" vmhwm="323734" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="818879" vmpeak="818879" vmrss="46534" vmhwm="46534" />
+        <model path="tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="681010" vmpeak="753099" vmrss="326902" vmhwm="326902" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="CPU" vmsize="848056" vmpeak="1522360" vmrss="147382" vmhwm="794481" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="create_exenetwork" device="GPU" vmsize="1699992" vmpeak="2187231" vmrss="1354892" vmhwm="1906344" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="CPU" vmsize="1437365" vmpeak="1522364" vmrss="643724" vmhwm="793755" />
+        <model path="tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml" test="infer_request_inference" device="GPU" vmsize="2152515" vmpeak="2224604" vmrss="1800026" vmhwm="1900395" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="CPU" vmsize="757526" vmpeak="905132" vmrss="83195" vmhwm="119653" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="create_exenetwork" device="GPU" vmsize="815988" vmpeak="932663" vmrss="470742" vmhwm="507760" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="CPU" vmsize="1007820" vmpeak="1007820" vmrss="123926" vmhwm="123926" />
+        <model path="tf/1.14.0/FP32/inception_v1/inception_v1.xml" test="infer_request_inference" device="GPU" vmsize="861520" vmpeak="933609" vmrss="507870" vmhwm="507870" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="CPU" vmsize="756756" vmpeak="925425" vmrss="34007" vmhwm="180769" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="create_exenetwork" device="GPU" vmsize="824168" vmpeak="962403" vmrss="478737" vmhwm="610280" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="CPU" vmsize="927669" vmpeak="999759" vmrss="141772" vmhwm="181966" />
+        <model path="tf/1.14.0/FP32/inception_v2/inception_v2.xml" test="infer_request_inference" device="GPU" vmsize="936755" vmpeak="1008845" vmrss="583963" vmhwm="611516" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="CPU" vmsize="759013" vmpeak="1063559" vmrss="51255" vmhwm="349113" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="create_exenetwork" device="GPU" vmsize="925958" vmpeak="1184101" vmrss="580056" vmhwm="902325" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="CPU" vmsize="1043583" vmpeak="1115672" vmrss="263520" vmhwm="349034" />
+        <model path="tf/1.14.0/FP32/inception_v3/inception_v3.xml" test="infer_request_inference" device="GPU" vmsize="1189548" vmpeak="1261638" vmrss="836646" vmhwm="903676" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="CPU" vmsize="764574" vmpeak="1327493" vmrss="64108" vmhwm="603842" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="create_exenetwork" device="GPU" vmsize="1221717" vmpeak="1686643" vmrss="875617" vmhwm="1404475" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="CPU" vmsize="1381556" vmpeak="1403402" vmrss="440356" vmhwm="602751" />
+        <model path="tf/1.14.0/FP32/inception_v4/inception_v4.xml" test="infer_request_inference" device="GPU" vmsize="1641921" vmpeak="1714011" vmrss="1289340" vmhwm="1405430" />
+        <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="CPU" vmsize="762119" vmpeak="2738828" vmrss="47203" vmhwm="947557" />
+        <model path="tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml" test="create_exenetwork" device="GPU" vmsize="1295483" vmpeak="4189812" vmrss="949788" vmhwm="3908550" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="CPU" vmsize="763840" vmpeak="805556" vmrss="21938" vmhwm="33264" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="create_exenetwork" device="GPU" vmsize="652572" vmpeak="744180" vmrss="306754" vmhwm="318432" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="CPU" vmsize="814000" vmpeak="814000" vmrss="33391" vmhwm="33391" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml" test="infer_request_inference" device="GPU" vmsize="672144" vmpeak="744233" vmrss="319026" vmhwm="319026" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="CPU" vmsize="754705" vmpeak="881188" vmrss="29282" vmhwm="44836" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="create_exenetwork" device="GPU" vmsize="614209" vmpeak="709759" vmrss="268778" vmhwm="326845" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="CPU" vmsize="818228" vmpeak="890318" vmrss="45513" vmhwm="45513" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml" test="infer_request_inference" device="GPU" vmsize="682484" vmpeak="754573" vmrss="328966" vmhwm="328966" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="754903" vmpeak="821928" vmrss="55237" vmhwm="82768" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="643887" vmpeak="751788" vmrss="298685" vmhwm="367602" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="831111" vmpeak="831111" vmrss="86732" vmhwm="86732" />
+        <model path="tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="720979" vmpeak="793069" vmrss="367584" vmhwm="367584" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="CPU" vmsize="756870" vmpeak="819759" vmrss="54586" vmhwm="78570" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="create_exenetwork" device="GPU" vmsize="705724" vmpeak="809490" vmrss="360267" vmhwm="435512" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="CPU" vmsize="835978" vmpeak="835978" vmrss="82583" vmhwm="82583" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml" test="infer_request_inference" device="GPU" vmsize="788902" vmpeak="860992" vmrss="435727" vmhwm="435727" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="CPU" vmsize="756725" vmpeak="831080" vmrss="76414" vmhwm="111914" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="create_exenetwork" device="GPU" vmsize="787058" vmpeak="902290" vmrss="441399" vmhwm="476911" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="CPU" vmsize="847299" vmpeak="847299" vmrss="120969" vmhwm="120969" />
+        <model path="tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml" test="infer_request_inference" device="GPU" vmsize="828920" vmpeak="901010" vmrss="475939" vmhwm="475939" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="CPU" vmsize="760988" vmpeak="1018754" vmrss="14484" vmhwm="296612" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="create_exenetwork" device="GPU" vmsize="600859" vmpeak="965967" vmrss="255569" vmhwm="685150" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="CPU" vmsize="1095155" vmpeak="1167245" vmrss="304607" vmhwm="304607" />
+        <model path="tf/1.14.0/FP32/ncf/ncf.xml" test="infer_request_inference" device="GPU" vmsize="1004577" vmpeak="1076666" vmrss="651943" vmhwm="689915" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="CPU" vmsize="756096" vmpeak="1100136" vmrss="27812" vmhwm="362344" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="create_exenetwork" device="GPU" vmsize="822830" vmpeak="1073947" vmrss="477193" vmhwm="792264" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="CPU" vmsize="1060571" vmpeak="1132661" vmrss="269808" vmhwm="362771" />
+        <model path="tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml" test="infer_request_inference" device="GPU" vmsize="1054684" vmpeak="1075272" vmrss="702310" vmhwm="794314" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="CPU" vmsize="760764" vmpeak="1338383" vmrss="42706" vmhwm="617047" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="create_exenetwork" device="GPU" vmsize="1108602" vmpeak="1561885" vmrss="762616" vmhwm="1279700" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="CPU" vmsize="1279819" vmpeak="1338409" vmrss="435102" vmhwm="617865" />
+        <model path="tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml" test="infer_request_inference" device="GPU" vmsize="1455146" vmpeak="1561388" vmrss="1101755" vmhwm="1279845" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="CPU" vmsize="765221" vmpeak="1552262" vmrss="59875" vmhwm="829250" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="create_exenetwork" device="GPU" vmsize="1322098" vmpeak="1985359" vmrss="976223" vmhwm="1703319" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="CPU" vmsize="1373006" vmpeak="1552293" vmrss="581891" vmhwm="829848" />
+        <model path="tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml" test="infer_request_inference" device="GPU" vmsize="1814348" vmpeak="1986380" vmrss="1461099" vmhwm="1704714" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="CPU" vmsize="766088" vmpeak="1079958" vmrss="27324" vmhwm="362155" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="create_exenetwork" device="GPU" vmsize="838965" vmpeak="1085884" vmrss="493407" vmhwm="804324" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="CPU" vmsize="1046157" vmpeak="1118246" vmrss="260515" vmhwm="362810" />
+        <model path="tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml" test="infer_request_inference" device="GPU" vmsize="1057223" vmpeak="1080772" vmrss="704066" vmhwm="799440" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="CPU" vmsize="761754" vmpeak="1365104" vmrss="45179" vmhwm="620879" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="create_exenetwork" device="GPU" vmsize="1120737" vmpeak="1613546" vmrss="774637" vmhwm="1331308" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="CPU" vmsize="1251346" vmpeak="1365135" vmrss="446415" vmhwm="620241" />
+        <model path="tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml" test="infer_request_inference" device="GPU" vmsize="1515817" vmpeak="1613858" vmrss="1162572" vmhwm="1331968" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="CPU" vmsize="839823" vmpeak="1569361" vmrss="155029" vmhwm="833157" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="create_exenetwork" device="GPU" vmsize="1363960" vmpeak="2068752" vmrss="1018507" vmhwm="1787042" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="CPU" vmsize="1476041" vmpeak="1569392" vmrss="679918" vmhwm="833914" />
+        <model path="tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml" test="infer_request_inference" device="GPU" vmsize="1904799" vmpeak="2060317" vmrss="1551756" vmhwm="1778167" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="CPU" vmsize="756602" vmpeak="1096774" vmrss="28393" vmhwm="363391" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="create_exenetwork" device="GPU" vmsize="845226" vmpeak="1103374" vmrss="500051" vmhwm="821986" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="CPU" vmsize="1063304" vmpeak="1135393" vmrss="271220" vmhwm="364399" />
+        <model path="tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml" test="infer_request_inference" device="GPU" vmsize="1092159" vmpeak="1105997" vmrss="738276" vmhwm="823983" />
+        <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="CPU" vmsize="838816" vmpeak="1561762" vmrss="116930" vmhwm="752906" />
+        <model path="tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml" test="create_exenetwork" device="GPU" vmsize="1674490" vmpeak="2318250" vmrss="1329842" vmhwm="2034986" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="CPU" vmsize="755062" vmpeak="880739" vmrss="28415" vmhwm="43480" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="create_exenetwork" device="GPU" vmsize="609298" vmpeak="704044" vmrss="263868" vmhwm="323488" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="CPU" vmsize="825048" vmpeak="897138" vmrss="49108" vmhwm="49108" />
+        <model path="tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml" test="infer_request_inference" device="GPU" vmsize="675844" vmpeak="747934" vmrss="322753" vmhwm="322753" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="CPU" vmsize="756804" vmpeak="978252" vmrss="70514" vmhwm="120370" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="create_exenetwork" device="GPU" vmsize="831318" vmpeak="949744" vmrss="485619" vmhwm="524550" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="CPU" vmsize="925689" vmpeak="997779" vmrss="130244" vmhwm="130244" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml" test="infer_request_inference" device="GPU" vmsize="878099" vmpeak="950188" vmrss="525395" vmhwm="525395" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="CPU" vmsize="759435" vmpeak="1442861" vmrss="34680" vmhwm="509454" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="create_exenetwork" device="GPU" vmsize="1012906" vmpeak="1460487" vmrss="667977" vmhwm="1179833" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="CPU" vmsize="1368043" vmpeak="1442861" vmrss="427737" vmhwm="509533" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml" test="infer_request_inference" device="GPU" vmsize="1542648" vmpeak="1542648" vmrss="1195304" vmhwm="1195304" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="CPU" vmsize="759558" vmpeak="1426185" vmrss="33862" vmhwm="507768" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="create_exenetwork" device="GPU" vmsize="1010358" vmpeak="1414454" vmrss="665451" vmhwm="1133941" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="CPU" vmsize="1350650" vmpeak="1426185" vmrss="421828" vmhwm="509168" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml" test="infer_request_inference" device="GPU" vmsize="1493681" vmpeak="1565770" vmrss="1145416" vmhwm="1145416" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="CPU" vmsize="761433" vmpeak="985784" vmrss="41514" vmhwm="254610" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="create_exenetwork" device="GPU" vmsize="876933" vmpeak="1078919" vmrss="531814" vmhwm="798001" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="CPU" vmsize="1028508" vmpeak="1064698" vmrss="201212" vmhwm="254390" />
+        <model path="tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml" test="infer_request_inference" device="GPU" vmsize="1091807" vmpeak="1163896" vmrss="739525" vmhwm="798023" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="CPU" vmsize="754067" vmpeak="1169247" vmrss="15686" vmhwm="429523" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="create_exenetwork" device="GPU" vmsize="682413" vmpeak="1130109" vmrss="337194" vmhwm="848733" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="CPU" vmsize="1106463" vmpeak="1178553" vmrss="321428" vmhwm="429871" />
+        <model path="tf/1.14.0/FP32/unet2d/unet2d.xml" test="infer_request_inference" device="GPU" vmsize="1083904" vmpeak="1155994" vmrss="730976" vmhwm="845882" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754010" vmpeak="2548502" vmrss="15452" vmhwm="1807863" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="686602" vmpeak="3327385" vmrss="340982" vmhwm="3045398" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2026776" vmpeak="2548502" vmrss="1241011" vmhwm="1808730" />
+        <model path="tf/1.14.0/FP32/vgg16/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2438568" vmpeak="3312188" vmrss="2084328" vmhwm="3029980" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="CPU" vmsize="754168" vmpeak="2617986" vmrss="16073" vmhwm="1877000" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="create_exenetwork" device="GPU" vmsize="612194" vmpeak="3415310" vmrss="266732" vmhwm="3133363" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="CPU" vmsize="2145479" vmpeak="2617885" vmrss="1287272" vmhwm="1877568" />
+        <model path="tf/1.14.0/FP32/vgg19/vgg19.xml" test="infer_request_inference" device="GPU" vmsize="2521367" vmpeak="3415297" vmrss="2167426" vmhwm="3133059" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1426625" vmrss="17173" vmhwm="684173" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="create_exenetwork" device="GPU" vmsize="684424" vmpeak="1460949" vmrss="339600" vmhwm="1180036" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="CPU" vmsize="1282802" vmpeak="1426625" vmrss="493737" vmhwm="684802" />
+        <model path="tf/1.14.0/FP32/yolo_v2/yolo_v2.xml" test="infer_request_inference" device="GPU" vmsize="1331783" vmpeak="1443006" vmrss="978560" vmhwm="1161124" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="CPU" vmsize="753724" vmpeak="954421" vmrss="14414" vmhwm="229578" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="create_exenetwork" device="GPU" vmsize="569179" vmpeak="816648" vmrss="224250" vmhwm="535449" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="CPU" vmsize="960810" vmpeak="960810" vmrss="174231" vmhwm="229807" />
+        <model path="tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml" test="infer_request_inference" device="GPU" vmsize="808627" vmpeak="880717" vmrss="455677" vmhwm="533002" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="CPU" vmsize="754344" vmpeak="1422647" vmrss="17437" vmhwm="680666" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="create_exenetwork" device="GPU" vmsize="686316" vmpeak="1436296" vmrss="340586" vmhwm="1154617" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="CPU" vmsize="1279797" vmpeak="1422616" vmrss="490982" vmhwm="680147" />
+        <model path="tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml" test="infer_request_inference" device="GPU" vmsize="1330780" vmpeak="1442570" vmrss="978392" vmhwm="1161490" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="CPU" vmsize="756958" vmpeak="1587260" vmrss="31108" vmhwm="836506" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="create_exenetwork" device="GPU" vmsize="1163712" vmpeak="1824596" vmrss="819011" vmhwm="1543559" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="CPU" vmsize="1405879" vmpeak="1591766" vmrss="610302" vmhwm="836594" />
+        <model path="tf/1.14.0/FP32/yolo_v3/yolo_v3.xml" test="infer_request_inference" device="GPU" vmsize="1734233" vmpeak="1823470" vmrss="1381925" vmhwm="1542178" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="CPU" vmsize="753975" vmpeak="895633" vmrss="15637" vmhwm="140927" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="create_exenetwork" device="GPU" vmsize="599332" vmpeak="728939" vmrss="254029" vmhwm="412566" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="CPU" vmsize="903469" vmpeak="975559" vmrss="116124" vmhwm="141182" />
+        <model path="tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml" test="infer_request_inference" device="GPU" vmsize="741738" vmpeak="813828" vmrss="389259" vmhwm="413476" />
+    </models>
+</attributes>
\ No newline at end of file
diff --git a/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..1a50d72
--- /dev/null
@@ -0,0 +1,156 @@
+<?xml version="1.0"?>
+<attributes>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>caffe/FP32/caffenet/caffenet.xml</value>
+        <value>caffe/FP32/densenet_121/densenet_121.xml</value>
+        <value>caffe/FP32/densenet_161/densenet_161.xml</value>
+        <value>caffe/FP32/densenet_169/densenet_169.xml</value>
+        <value>caffe/FP32/densenet_201/densenet_201.xml</value>
+        <value>caffe/FP32/dpn_92/dpn_92.xml</value>
+        <value>caffe/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>caffe/FP32/inception_v1/inception_v1.xml</value>
+        <value>caffe/FP32/inception_v2/inception_v2.xml</value>
+        <value>caffe/FP32/inception_v3/inception_v3.xml</value>
+        <value>caffe/FP32/inception_v4/inception_v4.xml</value>
+        <value>caffe/FP32/lenet/lenet.xml</value>
+        <value>caffe/FP32/mobilenet/mobilenet.xml</value>
+        <value>caffe/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+        <value>caffe/FP32/resnet_18/resnet_18.xml</value>
+        <value>caffe/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+        <value>caffe/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>caffe/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>caffe/FP32/resnet_v1_269/resnet_v1_269.xml</value>
+        <value>caffe/FP32/se_resnext_50/se_resnext_50.xml</value>
+        <value>caffe/FP32/squeezenet_v1.0/squeezenet_v1.0.xml</value>
+        <value>caffe/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>caffe/FP32/ssd_googlenet/ssd_googlenet.xml</value>
+        <value>caffe/FP32/ssd_squeezenet/ssd_squeezenet.xml</value>
+        <value>caffe/FP32/ssd_mobilenet/ssd_mobilenet.xml</value>
+        <value>caffe/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+        <value>caffe/FP32/ssd_vgg16_512/ssd_vgg16_512.xml</value>
+        <value>caffe/FP32/vgg16/vgg16.xml</value>
+        <value>caffe/FP32/vgg19/vgg19.xml</value>
+        <value>caffe/FP32/wrn_50_2/wrn_50_2.xml</value>
+        <value>caffe/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+        <value>caffe/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+        <value>caffe/FP32/yolo_v2/yolo_v2.xml</value>
+        <value>caffe/FP32/yolo_v2_tiny/yolo_v2_tiny.xml</value>
+        <value>caffe/FP32/yolo_v3/yolo_v3.xml</value>
+        <value>caffe/FP32/dilation/dilation.xml</value>
+        <value>caffe/FP32/dssd/dssd.xml</value>
+        <value>caffe/FP32/fcn8/fcn8.xml</value>
+        <value>caffe/FP32/fcn32/fcn32.xml</value>
+        <value>caffe/FP32/fcn_alexnet/fcn_alexnet.xml</value>
+        <value>caffe/FP32/mtcnn_p/mtcnn_p.xml</value>
+        <value>caffe/FP32/mtcnn_r/mtcnn_r.xml</value>
+        <value>caffe/FP32/mtcnn_o/mtcnn_o.xml</value>
+        <value>caffe/FP32/openpose_face/openpose_face.xml</value>
+        <value>caffe/FP32/openpose_hand/openpose_hand.xml</value>
+        <value>caffe/FP32/openpose_pose_coco/openpose_pose_coco.xml</value>
+        <value>caffe/FP32/places205_alexnet/places205_alexnet.xml</value>
+        <value>caffe/FP32/places205_googlenet/places205_googlenet.xml</value>
+        <value>caffe/FP32/se_bn_inception/se_bn_inception.xml</value>
+        <value>caffe/FP32/vnect/vnect.xml</value>
+        <value>tf/1.14.0/FP32/bert_base_uncased/bert_base_uncased.xml</value>
+        <value>tf/1.14.0/FP32/bert_xnli/bert_xnli.xml</value>
+        <value>tf/1.14.0/FP32/cmu/cmu.xml</value>
+        <value>tf/1.14.0/FP32/densenet_121/densenet_121.xml</value>
+        <value>tf/1.14.0/FP32/densenet_169/densenet_169.xml</value>
+        <value>tf/1.14.0/FP32/deeplab_v3/deeplab_v3.xml</value>
+        <value>tf/1.14.0/FP32/east/east.xml</value>
+        <value>tf/1.14.0/FP32/facenet/facenet.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_inception_v2_coco/faster_rcnn_inception_v2_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_inception_resnet_v2_atrous_coco/faster_rcnn_inception_resnet_v2_atrous_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_resnet50_coco/faster_rcnn_resnet50_coco.xml</value>
+        <value>tf/1.14.0/FP32/faster_rcnn_resnet101_coco/faster_rcnn_resnet101_coco.xml</value>
+        <value>tf/1.14.0/FP32/gnmt/gnmt.xml</value>
+        <value>tf/1.14.0/FP32/i3d_rgb/i3d_rgb.xml</value>
+        <value>tf/1.14.0/FP32/inception_v1/inception_v1.xml</value>
+        <value>tf/1.14.0/FP32/inception_v2/inception_v2.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/inception_v4/inception_v4.xml</value>
+        <value>tf/1.14.0/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>tf/1.14.0/FP32/mask_rcnn_resnet101_atrous_coco/mask_rcnn_resnet101_atrous_coco.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_0.25_128/mobilenet_v1_0.25_128.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_0.5_160/mobilenet_v1_0.5_160.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v1_1.0_224/mobilenet_v1_1.0_224.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v2_1.0_224/mobilenet_v2_1.0_224.xml</value>
+        <value>tf/1.14.0/FP32/mobilenet_v2_1.4_224/mobilenet_v2_1.4_224.xml</value>
+        <value>tf/1.14.0/FP32/ncf/ncf.xml</value>
+        <value>tf/1.14.0/FP32/nasnet-a_large/nasnet-a_large.xml</value>
+        <value>tf/1.14.0/FP32/nasnet-a_mobile/nasnet-a_mobile.xml</value>
+        <value>tf/1.14.0/FP32/pnasnet-5_large/pnasnet-5_large.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_50/resnet_v1_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1.5_50/resnet_v1.5_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_50/resnet_v2_50.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+        <value>tf/1.14.0/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+        <value>tf/1.14.0/FP32/rfcn_resnet101_coco/rfcn_resnet101_coco.xml</value>
+        <value>tf/1.14.0/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco/ssd_mobilenet_v1_fpn_coco.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_fpn_coco_602x602/ssd_mobilenet_v1_fpn_coco_602x602.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v2_coco/ssd_mobilenet_v2_coco.xml</value>
+        <value>tf/1.14.0/FP32/unet2d/unet2d.xml</value>
+        <value>tf/1.14.0/FP32/vgg16/vgg16.xml</value>
+        <value>tf/1.14.0/FP32/vgg19/vgg19.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2/yolo_v2.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2_voc/yolo_v2_voc.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v2_tiny_voc/yolo_v2_tiny_voc.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v3/yolo_v3.xml</value>
+        <value>tf/1.14.0/FP32/yolo_v3_tiny/yolo_v3_tiny.xml</value>
+        <value>tf/1.14.0/FP32/dssd_avigilon/dssd_avigilon.xml</value>
+        <value>tf/1.14.0/FP32/icv_squeezenet_v1.0/icv_squeezenet_v1.0.xml</value>
+        <value>tf/1.14.0/FP32/icv_squeezenet_v1.1/icv_squeezenet_v1.1.xml</value>
+        <value>mxnet/FP32/caffenet/caffenet.xml</value>
+        <value>mxnet/FP32/densenet_121/densenet_121.xml</value>
+        <value>mxnet/FP32/densenet_161/densenet_161.xml</value>
+        <value>mxnet/FP32/densenet_169/densenet_169.xml</value>
+        <value>mxnet/FP32/densenet_201/densenet_201.xml</value>
+        <value>mxnet/FP32/inception_v3/inception_v3.xml</value>
+        <value>mxnet/FP32/inception_v4/inception_v4.xml</value>
+        <value>mxnet/FP32/mobilenet/mobilenet.xml</value>
+        <value>mxnet/FP32/mobilenet_v2/mobilenet_v2.xml</value>
+        <value>mxnet/FP32/resnet_v1_101/resnet_v1_101.xml</value>
+        <value>mxnet/FP32/resnet_v1_152/resnet_v1_152.xml</value>
+        <value>mxnet/FP32/resnet_v2_101/resnet_v2_101.xml</value>
+        <value>mxnet/FP32/resnet_v2_152/resnet_v2_152.xml</value>
+        <value>mxnet/FP32/resnext_101/resnext_101.xml</value>
+        <value>mxnet/FP32/squeezenet_v1.1/squeezenet_v1.1.xml</value>
+        <value>mxnet/FP32/ssd_inception_v3_512/ssd_inception_v3_512.xml</value>
+        <value>mxnet/FP32/ssd_mobilenet_512/ssd_mobilenet_512.xml</value>
+        <value>mxnet/FP32/ssd_resnet50_512/ssd_resnet50_512.xml</value>
+        <value>mxnet/FP32/ssd_vgg16_300/ssd_vgg16_300.xml</value>
+        <value>mxnet/FP32/vgg16/vgg16.xml</value>
+        <value>mxnet/FP32/vgg19/vgg19.xml</value>
+        <value>mxnet/FP32/dpn_92/dpn_92.xml</value>
+        <value>mxnet/FP32/fcn8s_vgg16/fcn8s_vgg16.xml</value>
+        <value>mxnet/FP32/full_imagenet_network/full_imagenet_network.xml</value>
+        <value>mxnet/FP32/inception_resnet_v2/inception_resnet_v2.xml</value>
+        <value>mxnet/FP32/inception_v3_no_batchnorm/inception_v3_no_batchnorm.xml</value>
+        <value>mxnet/FP32/location_net/location_net.xml</value>
+        <value>mxnet/FP32/lresnet100e/lresnet100e.xml</value>
+        <value>mxnet/FP32/mtcnn_p/mtcnn_p.xml</value>
+        <value>mxnet/FP32/mtcnn_r/mtcnn_r.xml</value>
+        <value>mxnet/FP32/mtcnn_o/mtcnn_o.xml</value>
+        <value>mxnet/FP32/nin/nin.xml</value>
+        <value>mxnet/FP32/nst_vgg19/nst_vgg19.xml</value>
+        <value>mxnet/FP32/resnext_101_64x4d/resnext_101_64x4d.xml</value>
+        <value>mxnet/FP32/yolo_v1_full/yolo_v1_full.xml</value>
+        <value>mxnet/FP32/yolo_v1_tiny/yolo_v1_tiny.xml</value>
+        <value>onnx/FP32/ssd_resnet34/ssd_resnet34.xml</value>
+        <value>onnx/FP32/ssd_resnet34_new/ssd_resnet34_new.xml</value>
+        <value>onnx/FP32/retina_net/retina_net.xml</value>
+        <value>pytorch/FP32/inceptionv3_pretrained/inceptionv3_pretrained.xml</value>
+        <value>pytorch/FP32/resnet50_pretrained/resnet50_pretrained.xml</value>
+        <value>pytorch/FP32/squeezenet_v1.1_pretrained/squeezenet_v1.1_pretrained.xml</value>
+        <value>pytorch/FP32/resnet50_torchvision/resnet50_torchvision.xml</value>
+    </models>
+</attributes>
\ No newline at end of file
diff --git a/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memleaks_tests/nightly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..12e95b4
--- /dev/null
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+    <processes>
+        <value>1</value>
+    </processes>
+    <threads>
+        <value>1</value>
+    </threads>
+    <iterations>
+        <value>1000</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memleaks_tests/weekly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..74ddd50
--- /dev/null
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+    <processes>
+        <value>1</value>
+    </processes>
+    <threads>
+        <value>1</value>
+    </threads>
+    <iterations>
+        <value>10000</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/.automation/unittests/nightly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/unittests/nightly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..0fb2428
--- /dev/null
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<attributes>
+    <processes>
+        <value>1</value>
+    </processes>
+    <threads>
+        <value>1</value>
+        <value>2</value>
+    </threads>
+    <iterations>
+        <value>100</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/.automation/unittests/weekly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/unittests/weekly_configs/desktop_test_config.xml
new file mode 100644 (file)
index 0000000..6c94f40
--- /dev/null
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<attributes>
+    <processes>
+        <value>1</value>
+        <value>2</value>
+    </processes>
+    <threads>
+        <value>1</value>
+        <value>2</value>
+    </threads>
+    <iterations>
+        <value>1000</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+        <value>tf/1.14.0/FP32/inception_v3/inception_v3.xml</value>
+        <value>tf/1.14.0/FP32/ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/CMakeLists.txt b/tests/stress_tests/CMakeLists.txt
new file mode 100644 (file)
index 0000000..350abba
--- /dev/null
@@ -0,0 +1,25 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+if(ENABLE_DOCKER)
+    cmake_minimum_required(VERSION 3.3 FATAL_ERROR)
+else()
+    if (APPLE)
+        # due to https://cmake.org/cmake/help/v3.12/policy/CMP0068.html
+        cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
+    else()
+        cmake_minimum_required(VERSION 3.7.2 FATAL_ERROR)
+    endif()
+endif()
+
+if (CMAKE_BUILD_TYPE STREQUAL "")
+    message(STATUS "CMAKE_BUILD_TYPE not defined, 'Release' will be used")
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+find_package(InferenceEngineDeveloperPackage REQUIRED)
+
+add_subdirectory(unittests)
+add_subdirectory(memleaks_tests)
+add_subdirectory(memcheck_tests)
diff --git a/tests/stress_tests/README.md b/tests/stress_tests/README.md
new file mode 100644 (file)
index 0000000..509bd53
--- /dev/null
@@ -0,0 +1,86 @@
+# Stress Tests Suite
+
+This test suite contains tests evaluating the behavior of various OpenVINO use
+cases under stress conditions:
+
+- MemCheckTests measure the memory required for the use cases and fail when
+memory usage exceeds a pre-defined level.
+
+- StressMemLeaksTests ensure that the use cases do not increase memory
+consumption when executed continuously.
+
+- StressUnitTests execute various Inference Engine use cases in parallel
+threads and processes.
+
+Each test refers to configuration files located in the `<test dir>/local_configs`
+folder. The configuration files are installed along with the tests at build time.
+
+## Getting Started
+
+Stress tests are based on the googletest framework. You can filter tests with
+the `--gtest_filter` option and list the available tests with `--gtest_list_tests`.
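+
+For example, to list the tests and run only a subset (the filter pattern below
+is only an illustration, not a required name):
+
+``` bash
+./MemCheckTests --gtest_list_tests
+./MemCheckTests --gtest_filter=*create_exenetwork*
+```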
+
+Tests measuring memory have a temporary limitation: they need to be executed
+one at a time to avoid polluting the memory statistics. You can use
+[gtest-parallel][gtest-parallel] for mass test execution.
+
+### Pre-requisites
+
+- Linux OS to build the tests.
+
+- [gtest-parallel][gtest-parallel] to execute tests.
+
+### Building Tests
+
+Stress tests should be built in two steps.
+
+1. Build `dldt`
+
+Build `dldt` as usual but with `-DENABLE_TESTS=ON`.
+
+2. Build `stress_tests`
+
+Stress tests depend on the Inference Engine Developer Package located in the
+`dldt` build directory.
+
+In the command line snippet below, it is assumed that the Inference Engine
+Developer Package CMake module can be found in the `build` directory under the
+`dldt` repository root.
+
+``` bash
+(
+export DLDT_BUILD_DIR=$(git rev-parse --show-toplevel)/build
+mkdir -p build && cd build && \
+cmake -DInferenceEngineDeveloperPackage_DIR=$DLDT_BUILD_DIR .. && make -j$(nproc) \
+)
+```
+
+### Preparing Test Data
+
+Stress tests use models from [Open Model Zoo][open_model_zoo]. Download the
+models and convert them to IRs using the `./scripts/get_testdata.py` script.
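+
+For example (assuming the script is run from the `stress_tests` directory; it
+may accept additional options not shown here):
+
+``` bash
+./scripts/get_testdata.py
+```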
+
+From the Intel network, you can use cached models from the `vdp_tests` file share.
+Refer to [VDP shared folders][VDP-shared-folders] for details on using file shares.
+
+### Running Tests
+
+``` bash
+gtest-parallel ./MemCheckTests
+```
+
+``` bash
+gtest-parallel ./StressMemLeaksTests
+```
+
+MemCheckTests logs can be used to gather reference values based on current
+memory consumption:
+
+``` bash
+mkdir -p MemCheckTests-logs && \
+gtest-parallel -d ./MemCheckTests-logs ./MemCheckTests && \
+grep -rh ./MemCheckTests-logs -e ".*<model " | sed -e "s/.*<model /<model /" | sort
+```
+
+[VDP-shared-folders]: https://wiki.ith.intel.com/display/DLSDK/VDP+shared+folders
+[gtest-parallel]: https://github.com/google/gtest-parallel
+[open_model_zoo]: https://github.com/opencv/open_model_zoo
\ No newline at end of file
diff --git a/tests/stress_tests/common/ie_pipelines/pipelines.cpp b/tests/stress_tests/common/ie_pipelines/pipelines.cpp
new file mode 100644 (file)
index 0000000..c4d5e42
--- /dev/null
@@ -0,0 +1,136 @@
+#include "pipelines.h"
+#include "../utils.h"
+
+#include <iostream>
+#include <string>
+
+#include <inference_engine.hpp>
+
+using namespace InferenceEngine;
+
+std::function<void()> load_unload_plugin(const std::string &target_device) {
+    return [&] {
+        Core ie;
+        // GetVersions silently registers the plugin in `plugins` through `GetCPPPluginByName`
+        ie.GetVersions(target_device);
+        // Remove plugin for target_device from `plugins`
+        ie.UnregisterPlugin(target_device);
+    };
+}
+
+std::function<void()> read_network(const std::string &model) {
+    return [&] {
+        IE_SUPPRESS_DEPRECATED_START
+        CNNNetReader netReader;
+        netReader.ReadNetwork(model);
+        netReader.ReadWeights(fileNameNoExt(model) + ".bin");
+        IE_SUPPRESS_DEPRECATED_END
+    };
+}
+
+std::function<void()> create_cnnnetwork(const std::string &model) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    };
+}
+
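+// Reads a network and doubles the batch dimension of every input that has an explicit batch axis.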
+std::function<void()> cnnnetwork_reshape_batch_x2(const std::string &model) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        const InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+        ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+        bool doReshape = false;
+        for (const InputsDataMap::value_type& input : inputInfo) {
+            int batchIndex = -1;
+            auto layout = input.second->getTensorDesc().getLayout();
+            if ((layout == Layout::NCHW) || (layout == Layout::NCDHW) ||
+                (layout == Layout::NHWC) || (layout == Layout::NDHWC) ||
+                (layout == Layout::NC)) {
+                batchIndex = 0;
+            } else if (layout == CN) {
+                batchIndex = 1;
+            }
+            if (batchIndex != -1) {
+                shapes[input.first][batchIndex] *= 2;
+                doReshape = true;
+            }
+        }
+        if (doReshape)
+            cnnNetwork.reshape(shapes);
+        else
+            throw std::logic_error("Reshape wasn't applied for a model.");
+    };
+}
+
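+// Sets U8 precision, disables resize and assigns NCHW/NC layout to 4D/2D inputs of a network.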
+std::function<void()> set_input_params(const std::string &model) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+        for (auto &input : inputInfo) {
+            input.second->getPreProcess().setResizeAlgorithm(NO_RESIZE);
+            input.second->setPrecision(Precision::U8);
+            if (input.second->getInputData()->getTensorDesc().getDims().size() == 4)
+                input.second->setLayout(Layout::NCHW);
+            else if (input.second->getInputData()->getTensorDesc().getDims().size() == 2)
+                input.second->setLayout(Layout::NC);
+            else
+                throw std::logic_error("Setting of input parameters wasn't applied for a model.");
+        }
+    };
+}
+
+std::function<void()> create_exenetwork(const std::string &model, const std::string &target_device) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    };
+}
+
+std::function<void()> recreate_exenetwork(Core &ie, const std::string &model, const std::string &target_device) {
+    return [&] {
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    };
+}
+
+std::function<void()> create_infer_request(const std::string &model, const std::string &target_device) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+        InferRequest infer_request = exeNetwork.CreateInferRequest();
+    };
+}
+
+
+std::function<void()> recreate_infer_request(InferenceEngine::ExecutableNetwork& exeNetwork) {
+    return [&] {
+        InferRequest infer_request = exeNetwork.CreateInferRequest();
+    };
+}
+
+std::function<void()> infer_request_inference(const std::string &model, const std::string &target_device) {
+    return [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+        InferRequest infer_request = exeNetwork.CreateInferRequest();
+        infer_request.Infer();
+        OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+        for (auto &output : output_info)
+            Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+    };
+}
+
+std::function<void()> reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork) {
+    return [&] {
+        infer_request.Infer();
+        OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+        for (auto &output : output_info)
+            Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+    };
+}
diff --git a/tests/stress_tests/common/ie_pipelines/pipelines.h b/tests/stress_tests/common/ie_pipelines/pipelines.h
new file mode 100644 (file)
index 0000000..aaac1c3
--- /dev/null
@@ -0,0 +1,16 @@
+#include <string>
+#include <functional>
+#include <inference_engine.hpp>
+
+std::function<void()> load_unload_plugin(const std::string &target_device);
+std::function<void()> read_network(const std::string &model);
+std::function<void()> create_cnnnetwork(const std::string &model);
+std::function<void()> cnnnetwork_reshape_batch_x2(const std::string &model);
+std::function<void()> set_input_params(const std::string &model);
+std::function<void()> create_exenetwork(const std::string &model, const std::string &target_device);
+std::function<void()> recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device);
+std::function<void()> create_infer_request(const std::string &model, const std::string &target_device);
+std::function<void()> recreate_infer_request(InferenceEngine::ExecutableNetwork& exeNetwork);
+std::function<void()> infer_request_inference(const std::string &model, const std::string &target_device);
+std::function<void()> reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork);
diff --git a/tests/stress_tests/common/managers/task_manager.h b/tests/stress_tests/common/managers/task_manager.h
new file mode 100644 (file)
index 0000000..797432e
--- /dev/null
@@ -0,0 +1,87 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <functional>
+
+#include "../tests_utils.h"
+
+enum ManagerStatus {
+    NOT_STARTED = -2,
+    NOT_FINISHED = -1,
+    FINISHED_SUCCESSFULLY = 0,
+    FINISHED_UNEXPECTEDLY
+};
+
+template<typename Type>
+using Task = std::pair<ManagerStatus, std::function<Type()>>;
+
+template<typename Type>
+class TaskManager {
+public:
+    std::vector<Task<Type>> tasks;
+    std::vector<Type> tasks_results;
+
+    TaskManager() {}
+
+    TaskManager(const std::initializer_list<std::function<Type()>> &tasks_list) {
+        tasks.reserve(tasks_list.size());
+        for (const auto &task : tasks_list)
+            add_task(task);
+    }
+
+    void add_task(const std::function<Type()> &task) {
+        auto _task = Task<Type>(ManagerStatus::NOT_STARTED, task);
+        tasks.push_back(_task);
+    }
+
+    void run_sequentially() {
+        // TODO: make it asynchronous
+        tasks_results.reserve(tasks.size());
+        for (auto &task : tasks) {
+            task.first = ManagerStatus::NOT_FINISHED;
+            tasks_results.push_back(task.second());
+        }
+    }
+
+    void run_parallel_n_wait() {
+        run_parallel();
+        wait_all();
+    }
+
+    void wait_all() {
+        int numtasks = tasks.size();
+        for (int i = 0; i < numtasks; i++)
+            if (tasks[i].first == ManagerStatus::NOT_FINISHED)
+                wait_task(i);
+    }
+
+    std::vector<ManagerStatus> get_all_statuses() {
+        std::vector<ManagerStatus> statuses;
+
+        int numtasks = tasks.size();
+        for (int i = 0; i < numtasks; i++)
+            statuses.push_back(get_task_status(i));
+        return statuses;
+    }
+
+    std::vector<TestResult> get_all_results() {
+        return tasks_results;
+    }
+
+    TestResult get_task_result(int task_index) {
+        if (task_index < 0 || tasks_results.size() <= static_cast<size_t>(task_index))
+            throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+        return tasks_results[task_index];
+    }
+
+    virtual void run_parallel() = 0;
+
+    virtual void wait_task(int task_index) = 0; // TODO: implement for run_sequentially
+
+    virtual ManagerStatus get_task_status(int task_index) = 0;
+
+};
\ No newline at end of file
diff --git a/tests/stress_tests/common/managers/thread_manager.h b/tests/stress_tests/common/managers/thread_manager.h
new file mode 100644 (file)
index 0000000..9157899
--- /dev/null
@@ -0,0 +1,51 @@
+#include "task_manager.h"
+
+#include <future>
+
+template <typename Type>
+class ThreadManager : public TaskManager<Type> {
+public:
+    using TaskManager<Type>::tasks;
+    using TaskManager<Type>::tasks_results;
+    std::vector<std::future<TestResult>> threads;
+
+    using TaskManager<Type>::TaskManager;
+
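+    // Launch every not-yet-started task on its own thread via std::async; results are collected in wait_task().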
+    void run_parallel() final {
+        // TODO: implement run_task function according to wait_task
+        int numtasks = tasks.size();
+        threads.reserve(numtasks);
+        tasks_results.reserve(numtasks);
+
+        for (int i = 0; i < numtasks; i++)
+            if (tasks[i].first == ManagerStatus::NOT_STARTED) {
+                tasks[i].first = ManagerStatus::NOT_FINISHED;
+                threads.push_back(std::async(std::launch::async, tasks[i].second));
+            }
+    }
+
+    void wait_task(int task_index) final {
+        if (task_index < 0 || threads.size() <= static_cast<size_t>(task_index))
+            throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+        try {
+            tasks_results.push_back(threads[task_index].get());
+            tasks[task_index].first = ManagerStatus::FINISHED_SUCCESSFULLY;
+        } catch (const std::exception &err) { // TODO: catch any exception
+            tasks[task_index].first = ManagerStatus::FINISHED_UNEXPECTEDLY;
+            tasks_results.push_back(TestResult(TestStatus::TEST_FAILED, "Test finished unexpectedly: " + std::string(err.what())));
+        }
+    }
+
+    ManagerStatus get_task_status(int task_index) final {
+        if (task_index < 0 || tasks.size() <= static_cast<size_t>(task_index))
+            throw std::out_of_range("Task index " + std::to_string(task_index) + " out of number of tasks");
+
+        return tasks[task_index].first;
+    }
+};
\ No newline at end of file
diff --git a/tests/stress_tests/common/tests_utils.cpp b/tests/stress_tests/common/tests_utils.cpp
new file mode 100644 (file)
index 0000000..ed91e0f
--- /dev/null
@@ -0,0 +1,99 @@
+#include "tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+#include <string>
+
+#define DEBUG_MODE false
+
+const pugi::xml_document & Environment::getTestConfig() {
+    return _test_config;
+}
+
+void Environment::setTestConfig(const pugi::xml_document &test_config) {
+    _test_config.reset(test_config);
+}
+
+const pugi::xml_document & Environment::getEnvConfig() {
+    return _env_config;
+}
+
+void Environment::setEnvConfig(const pugi::xml_document &env_config) {
+    _env_config.reset(env_config);
+}
+
+std::vector<TestCase> generateTestsParams(std::initializer_list<std::string> fields) {
+    std::vector<TestCase> tests_cases;
+    const pugi::xml_document & test_config = Environment::Instance().getTestConfig();
+    std::string models_path = Environment::Instance().getEnvConfig()
+            .child("attributes").child("irs_path").child("value").text().as_string();
+
+    std::vector<int> processes;
+    std::vector<int> threads;
+    std::vector<int> iterations;
+    std::vector<std::string> devices;
+    std::vector<std::string> models;
+
+    pugi::xml_node values;
+    for (auto field = fields.begin(); field != fields.end(); field++) {
+        if (*field == "processes") {
+            values = test_config.child("attributes").child("processes");
+            for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+                processes.push_back(val.text().as_int());
+        } else if (*field == "threads") {
+            values = test_config.child("attributes").child("threads");
+            for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+                threads.push_back(val.text().as_int());
+        } else if (*field == "iterations") {
+            values = test_config.child("attributes").child("iterations");
+            for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+                iterations.push_back(val.text().as_int());
+        } else if (*field == "devices") {
+            values = test_config.child("attributes").child("devices");
+            for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+                devices.push_back(val.text().as_string());
+        } else if (*field == "models") {
+            values = test_config.child("attributes").child("models");
+            for (pugi::xml_node val = values.first_child(); val; val = val.next_sibling())
+                models.push_back(val.text().as_string());
+        }
+    }
+
+    // Initialize variables with default values if they weren't filled
+    processes = !processes.empty() ? processes : std::vector<int>{1};
+    threads = !threads.empty() ? threads : std::vector<int>{1};
+    iterations = !iterations.empty() ? iterations : std::vector<int>{1};
+    devices = !devices.empty() ? devices : std::vector<std::string>{"NULL"};
+    models = !models.empty() ? models : std::vector<std::string>{"NULL"};
+
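+    // Every combination of the requested parameters (cartesian product) becomes a separate test case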
+    for (auto &numprocesses : processes)
+        for (auto &numthreads : threads)
+            for (auto &numiters : iterations)
+                for (auto &device : devices)
+                    for (auto &model : models)
+                        tests_cases.push_back(TestCase(numprocesses, numthreads, numiters, device, OS_PATH_JOIN({models_path, model}), model));
+
+    return tests_cases;
+}
+
+std::string getTestCaseName(const testing::TestParamInfo<TestCase> &obj) {
+    return obj.param.test_case_name;
+}
+
+void test_wrapper(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params) {
+    tests_pipeline(params.model, params.device, params.numiters);
+}
+
+void _runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params) {
+    run_in_threads(params.numthreads, test_wrapper, tests_pipeline, params);
+}
+
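+// runTest forks params.numprocesses child processes; each of them runs params.numthreads threads of the
+// tests pipeline (with DEBUG_MODE the pipeline is executed directly in the current process instead)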
+void runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params) {
+#if DEBUG_MODE
+    tests_pipeline(params.model, params.device, params.numiters);
+#else
+    int status = run_in_processes(params.numprocesses, _runTest, tests_pipeline, params);
+    ASSERT_EQ(status, 0) << "Test failed with exitcode " << std::to_string(status);
+#endif
+}
+
diff --git a/tests/stress_tests/common/tests_utils.h b/tests/stress_tests/common/tests_utils.h
new file mode 100644 (file)
index 0000000..d2acf0b
--- /dev/null
@@ -0,0 +1,75 @@
+#pragma once
+
+#include "utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+#include <string>
+#include <vector>
+#include <thread>
+#include <unistd.h>
+#include <sys/wait.h>
+
+enum TestStatus
+{
+    TEST_NOT_STARTED = 0,
+    TEST_FAILED,
+    TEST_OK
+};
+
+using TestResult = std::pair<TestStatus, std::string>;
+
+class TestCase {
+public:
+    int numprocesses;
+    int numthreads;
+    int numiters;
+    std::string device;
+    std::string model_name;
+    std::string model;
+    std::string test_case_name;
+
+    TestCase(int _numprocesses, int _numthreads, int _numiters, std::string _device, const std::string& _model, const std::string& _model_name)
+            : numprocesses(_numprocesses), numthreads(_numthreads), numiters(_numiters),
+              device(std::move(_device)), model_name(_model_name), model(_model) {
+        test_case_name =
+                "Numprocesses_" + std::to_string(numprocesses) + "_Numthreads_" + std::to_string(numthreads) +
+                "_Numiters_" + std::to_string(numiters) + "_Device_" + update_item_for_name(device) + "_Model_" +
+                update_item_for_name(model_name);
+    }
+
+private:
+    std::string update_item_for_name(const std::string &item) {
+        std::string _item(item);
+        for (std::string::size_type index = 0; index < _item.size(); ++index) {
+            if (!isalnum(_item[index]) && _item[index] != '_')
+                _item[index] = '_';
+        }
+        return _item;
+    }
+};
+
+class Environment {
+private:
+    pugi::xml_document _test_config;
+    pugi::xml_document _env_config;
+    Environment() = default;
+    Environment(const Environment&) = delete;
+    Environment& operator=(const Environment&) = delete;
+public:
+    static Environment& Instance(){
+        static Environment env;
+        return env;
+    }
+
+    const pugi::xml_document & getTestConfig();
+    void setTestConfig(const pugi::xml_document &test_config);
+    const pugi::xml_document & getEnvConfig();
+    void setEnvConfig(const pugi::xml_document &env_config);
+};
+
+std::vector<TestCase> generateTestsParams(std::initializer_list<std::string> items);
+std::string getTestCaseName(const testing::TestParamInfo<TestCase> &obj);
+
+void runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params);
+void _runTest(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params);
+void test_wrapper(const std::function<void(std::string, std::string, int)> &tests_pipeline, const TestCase &params);
diff --git a/tests/stress_tests/common/utils.cpp b/tests/stress_tests/common/utils.cpp
new file mode 100644 (file)
index 0000000..607c3f7
--- /dev/null
@@ -0,0 +1,64 @@
+#include "utils.h"
+
+#include <string>
+#include <string.h>
+
+std::string OS_PATH_JOIN(std::initializer_list<std::string> list) {
+    if (!list.size())
+        return "";
+    std::string res = *list.begin();
+    for (auto it = list.begin() + 1; it != list.end(); it++) {
+        res += OS_SEP + *it;
+    }
+    return res;
+}
+
+std::string fileNameNoExt(const std::string &filepath) {
+    auto pos = filepath.rfind('.');
+    if (pos == std::string::npos) return filepath;
+    return filepath.substr(0, pos);
+}
+
+
+static size_t parseLine(char* line) {
+    // This assumes that a digit will be found and the line ends in " Kb".
+    size_t i = strlen(line);
+    const char* p = line;
+    while (*p <'0' || *p > '9') p++;
+    line[i-3] = '\0';
+    i = (size_t)atoi(p);
+    return i;
+}
+
+#ifdef _WIN32
+#include <windows.h>
+#include <psapi.h>
+size_t getVmSizeInKB() {
+    // TODO rewrite for Virtual Memory
+    PROCESS_MEMORY_COUNTERS pmc;
+    pmc.cb = sizeof(PROCESS_MEMORY_COUNTERS);
+    GetProcessMemoryInfo(GetCurrentProcess(), &pmc, pmc.cb);
+    return pmc.WorkingSetSize / 1024;   // WorkingSetSize is reported in bytes
+}
+#else
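+// Parse /proc/self/status and return the value (in KB) of the line that starts with `name`, e.g. "VmSize:"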
+size_t getVirtualMemoryInKB(const char *name) {
+    FILE* file = fopen("/proc/self/status", "r");
+    size_t result = 0;
+    if (file != nullptr) {
+        char line[128];
+
+        while (fgets(line, 128, file) != NULL) {
+            if (strncmp(line, name, strlen(name)) == 0) {
+                result = parseLine(line);
+                break;
+            }
+        }
+        fclose(file);
+    }
+    return result;
+}
+
+size_t getVmSizeInKB() {return getVirtualMemoryInKB("VmSize:");}
+size_t getVmPeakInKB() {return getVirtualMemoryInKB("VmPeak:");}
+size_t getVmRSSInKB() {return getVirtualMemoryInKB("VmRSS:");}
+size_t getVmHWMInKB() {return getVirtualMemoryInKB("VmHWM:");}
+
+#endif
diff --git a/tests/stress_tests/common/utils.h b/tests/stress_tests/common/utils.h
new file mode 100644 (file)
index 0000000..7e82d12
--- /dev/null
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include <thread>
+#include <functional>
+#include <sys/unistd.h>
+#include <sys/wait.h>
+
+#ifdef _WIN32
+#define OS_SEP std::string("\\")
+#else
+#define OS_SEP std::string("/")
+#endif
+
+
+#define log_info(str) std::cout << "[ INFO ] " << str << std::endl
+#define log_warn(str) std::cout << "[ WARNING ] " << str << std::endl
+#define log_err(str) std::cout << "[ ERROR ] " << str << std::endl
+#define log_debug(str) std::cout << "[ DEBUG ] " << str << std::endl
+
+std::string OS_PATH_JOIN(std::initializer_list<std::string> list);
+
+std::string fileNameNoExt(const std::string &filepath);
+
+#define getVmValues(vmsize, vmpeak, vmrss, vmhwm) do {                                \
+                                                  vmsize = (long) getVmSizeInKB();    \
+                                                  vmpeak = (long) getVmPeakInKB();    \
+                                                  vmrss = (long) getVmRSSInKB();      \
+                                                  vmhwm = (long) getVmHWMInKB();      \
+                                                  } while (0)
+
+size_t getVmSizeInKB();
+size_t getVmPeakInKB();
+size_t getVmRSSInKB();
+size_t getVmHWMInKB();
+
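+// Fork numprocesses children, run function(args...) in each of them and return the last non-zero
+// waitpid status (0 if every child succeeded)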
+template<typename Function, typename ... Args>
+int run_in_processes(const int &numprocesses, Function const &function, Args ... args) {
+    std::vector<pid_t> child_pids(numprocesses);
+
+    for (int i = 0; i < numprocesses; i++) {
+        child_pids[i] = fork();
+        if (child_pids[i] == 0) {
+            function(args...);
+            exit(EXIT_SUCCESS);
+        }
+    }
+
+    int status = 0;
+    for (int i = 0; i < numprocesses; i++) {
+        int _status = 0;
+        waitpid(child_pids[i], &_status, WSTOPPED);
+        if (_status) {
+            log_err("Process run # " << i << " failed with exitcode " << _status);
+            status = _status;
+        }
+    }
+    return status;
+}
+
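+// Run function(args...) concurrently in numthreads std::thread instances and join them all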
+template<typename Function, typename ... Args>
+inline void run_in_threads(const int &numthreads, Function const &function, Args ... args) {
+    std::vector<std::thread> v(numthreads);
+    for (int thr_i = 0; thr_i < numthreads; thr_i++) {
+        v[thr_i] = std::thread(function, args...);
+    }
+
+    for (int thr_i = 0; thr_i < numthreads; thr_i++) {
+        v[thr_i].join();
+    }
+    v.clear();
+}
diff --git a/tests/stress_tests/memcheck_tests/CMakeLists.txt b/tests/stress_tests/memcheck_tests/CMakeLists.txt
new file mode 100644 (file)
index 0000000..cb7c630
--- /dev/null
@@ -0,0 +1,38 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "MemCheckTests")
+
+file (GLOB SRC
+        ../common/*.cpp
+        ../common/ie_pipelines/*.cpp
+        *.cpp
+        tests_pipelines/*.cpp)
+
+file (GLOB HDR
+        ../common/*.h
+        ../common/ie_pipelines/*.h
+        *.h
+        tests_pipelines/*.h)
+
+# Create executable from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+        IE::gtest
+        IE::gtest_main
+        IE::pugixml
+        gflags
+        ${InferenceEngine_LIBRARIES}
+        )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+        "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/env_config.xml COPYONLY)
+configure_file(local_configs/references_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memcheck_tests/references_config.xml COPYONLY)
diff --git a/tests/stress_tests/memcheck_tests/flags.h b/tests/stress_tests/memcheck_tests/flags.h
new file mode 100644 (file)
index 0000000..9bd09da
--- /dev/null
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "env_config.xml"}), env_conf_message);
+
+/// @brief message for refs_config argument
+static const char refs_conf_message[] = "Optional. Path to a references config with values of memory consumption per test.";
+
+/// @brief Define parameter for set references' configuration <br>
+/// refs_conf is an optional parameter
+DEFINE_string(refs_conf, OS_PATH_JOIN({"stress_tests_configs", "memcheck_tests", "references_config.xml"}), refs_conf_message);
\ No newline at end of file
diff --git a/tests/stress_tests/memcheck_tests/local_configs/env_config.xml b/tests/stress_tests/memcheck_tests/local_configs/env_config.xml
new file mode 100644 (file)
index 0000000..ffcac86
--- /dev/null
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<attributes>
+    <irs_path>
+        <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/open_model_zoo/efd238d02035f8a5417b7b1e25cd4c997d44351f/IRs</value>
+    </irs_path>
+</attributes>
diff --git a/tests/stress_tests/memcheck_tests/local_configs/references_config.xml b/tests/stress_tests/memcheck_tests/local_configs/references_config.xml
new file mode 100644 (file)
index 0000000..35b701b
--- /dev/null
@@ -0,0 +1,21 @@
+<?xml version="1.0"?>
+<attributes>
+    <models>
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="create_exenetwork" device="CPU" vmsize="757218" vmpeak="901683" vmrss="73920" vmhwm="107866" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="create_exenetwork" device="GPU" vmsize="747815" vmpeak="860978" vmrss="401808" vmhwm="435358" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="infer_request_inference" device="CPU" vmsize="1001189" vmpeak="1001189" vmrss="116080" vmhwm="116080" />
+<model path="public/mobilenet-ssd/FP32/mobilenet-ssd.xml" test="infer_request_inference" device="GPU" vmsize="788752" vmpeak="860842" vmrss="435283" vmhwm="435283" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="create_exenetwork" device="CPU" vmsize="754806" vmpeak="803184" vmrss="15206" vmhwm="26532" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="create_exenetwork" device="GPU" vmsize="554650" vmpeak="644666" vmrss="207592" vmhwm="217720" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="infer_request_inference" device="CPU" vmsize="959257" vmpeak="959257" vmrss="26690" vmhwm="26690" />
+<model path="public/mtcnn-r/FP32/mtcnn-r.xml" test="infer_request_inference" device="GPU" vmsize="572576" vmpeak="644666" vmrss="215230" vmhwm="215230" />
+<model path="public/ssd300/FP32/ssd300.xml" test="create_exenetwork" device="CPU" vmsize="755224" vmpeak="1146142" vmrss="22246" vmhwm="370770" />
+<model path="public/ssd300/FP32/ssd300.xml" test="create_exenetwork" device="GPU" vmsize="747709" vmpeak="1031694" vmrss="401746" vmhwm="749962" />
+<model path="public/ssd300/FP32/ssd300.xml" test="infer_request_inference" device="CPU" vmsize="1343474" vmpeak="1415563" vmrss="314204" vmhwm="371131" />
+<model path="public/ssd300/FP32/ssd300.xml" test="infer_request_inference" device="GPU" vmsize="1088700" vmpeak="1160790" vmrss="739626" vmhwm="748008" />
+<model path="public/vgg16/FP32/vgg16.xml" test="create_exenetwork" device="CPU" vmsize="754050" vmpeak="2548532" vmrss="15593" vmhwm="1808765" />
+<model path="public/vgg16/FP32/vgg16.xml" test="create_exenetwork" device="GPU" vmsize="648912" vmpeak="3289101" vmrss="299327" vmhwm="3003457" />
+<model path="public/vgg16/FP32/vgg16.xml" test="infer_request_inference" device="CPU" vmsize="2257006" vmpeak="2548532" vmrss="1243448" vmhwm="1809143" />
+<model path="public/vgg16/FP32/vgg16.xml" test="infer_request_inference" device="GPU" vmsize="2413290" vmpeak="3289101" vmrss="2059780" vmhwm="3006845" />
+    </models>
+</attributes>
diff --git a/tests/stress_tests/memcheck_tests/local_configs/test_config.xml b/tests/stress_tests/memcheck_tests/local_configs/test_config.xml
new file mode 100644 (file)
index 0000000..9944819
--- /dev/null
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<attributes>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>public/vgg16/FP32/vgg16.xml</value>
+        <value>public/mtcnn-r/FP32/mtcnn-r.xml</value>
+        <value>public/mobilenet-ssd/FP32/mobilenet-ssd.xml</value>
+        <value>public/ssd300/FP32/ssd300.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/memcheck_tests/main.cpp b/tests/stress_tests/memcheck_tests/main.cpp
new file mode 100644 (file)
index 0000000..4d235d1
--- /dev/null
@@ -0,0 +1,65 @@
+#include "flags.h"
+#include "../common/utils.h"
+#include "tests_utils.h"
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+bool parseAndCheckCommandLine(int argc, char **argv) {
+    // ---------------------------Parsing and validating input arguments--------------------------------------
+    log_info("Parsing input parameters");
+
+    int new_argc = 0;
+    std::vector<char*> _argv;
+    for (int i = 0; i < argc; i++) {
+        if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+            _argv.push_back(argv[i]);
+            new_argc++;
+        }
+    }
+    char **new_argv = &_argv[0];
+    gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+    if (FLAGS_help || FLAGS_h) {
+        // TODO print info
+        //::testing::InitGoogleTest(&argc, argv);
+        return false;
+    }
+
+    pugi::xml_document config;
+    pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_env_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_refs_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading references config \"" << FLAGS_refs_conf << "\": " << result.description());
+        return false;
+    }
+    return true;
+}
+
+
+int main(int argc, char **argv) {
+    if (!parseAndCheckCommandLine(argc, argv)) {
+        return 0;   // TODO return correct status
+    }
+
+    pugi::xml_document config;
+    config.load_file(FLAGS_test_conf.c_str());
+    Environment::Instance().setTestConfig(config);
+    config.load_file(FLAGS_env_conf.c_str());
+    Environment::Instance().setEnvConfig(config);
+    config.load_file(FLAGS_refs_conf.c_str());
+    MemCheckEnvironment::Instance().setRefsConfig(config);
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/tests/stress_tests/memcheck_tests/tests.cpp b/tests/stress_tests/memcheck_tests/tests.cpp
new file mode 100644 (file)
index 0000000..5383a32
--- /dev/null
@@ -0,0 +1,52 @@
+#include "tests_utils.h"
+#include "../common/tests_utils.h"
+#include "../common/managers/thread_manager.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <gtest/gtest.h>
+
+#define checkRefVmValues()                                                                                               \
+    ASSERT_GT(test_refs.ref_vmsize, 0) << "Reference value of VmSize is not positive. Value: " << test_refs.ref_vmsize;  \
+    ASSERT_GT(test_refs.ref_vmpeak, 0) << "Reference value of VmPeak is not positive. Value: " << test_refs.ref_vmpeak;  \
+    ASSERT_GT(test_refs.ref_vmrss, 0) << "Reference value of VmRSS is not positive. Value: " << test_refs.ref_vmrss;     \
+    ASSERT_GT(test_refs.ref_vmhwm, 0) << "Reference value of VmHWM is not positive. Value: " << test_refs.ref_vmhwm;
+
+class MemCheckTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(MemCheckTestSuite, create_exenetwork) {
+    std::string test_name = "create_exenetwork";
+    auto test_params = GetParam();
+
+    TestReferences test_refs;
+    test_refs.collect_vm_values_for_test(test_name, test_params);
+
+    checkRefVmValues();
+
+    TestResult res = test_create_exenetwork(test_params.model_name, test_params.model, test_params.device,
+                                            test_refs.ref_vmsize, test_refs.ref_vmpeak, test_refs.ref_vmrss,
+                                            test_refs.ref_vmhwm);
+    EXPECT_EQ(res.first, TestStatus::TEST_OK) << res.second;
+}
+
+TEST_P(MemCheckTestSuite, infer_request_inference) {
+    std::string test_name = "infer_request_inference";
+    auto test_params = GetParam();
+
+    TestReferences test_refs;
+    test_refs.collect_vm_values_for_test(test_name, test_params);
+
+    checkRefVmValues();
+
+    TestResult res = test_infer_request_inference(test_params.model_name, test_params.model, test_params.device,
+                                                  test_refs.ref_vmsize, test_refs.ref_vmpeak, test_refs.ref_vmrss,
+                                                  test_refs.ref_vmhwm);
+    EXPECT_EQ(res.first, TestStatus::TEST_OK) << res.second;
+}
+// tests_pipelines/tests_pipelines.cpp
+
+INSTANTIATE_TEST_CASE_P(MemCheckTests, MemCheckTestSuite,
+                        ::testing::ValuesIn(
+                                generateTestsParams({"devices", "models"})),
+                        getTestCaseName);
diff --git a/tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.cpp b/tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.cpp
new file mode 100644 (file)
index 0000000..58204a8
--- /dev/null
@@ -0,0 +1,124 @@
+#include "tests_pipelines.h"
+
+#include <string>
+#include <math.h>
+#include <chrono>
+
+#include <inference_engine.hpp>
+
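+// Measured values are multiplied by this factor when a reference-config record is logged
+// (see log_debug_ref_record_for_test below)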
+#define REPORTING_THRESHOLD 1.1
+
+using namespace InferenceEngine;
+
+#define getAlignedVmValues(vmsize, vmpeak, vmrss, vmhwm, vmsize_to_align, vmrss_to_align)   \
+        getVmValues(vmsize, vmpeak, vmrss, vmhwm);                                          \
+        vmsize -= vmsize_to_align;                                                          \
+        vmpeak -= vmsize_to_align;                                                          \
+        vmrss -= vmrss_to_align;                                                            \
+        vmhwm -= vmrss_to_align;
+
+#define log_debug_ref_record_for_test(test_name)                                                            \
+        log_debug("Record to update reference config: "                                                           \
+                  << "<model path=\"" + model_name + "\"" + " test=\"" + test_name + "\" device=\"" +       \
+                  target_device +                                                                           \
+                  "\" vmsize=\"" + std::to_string((int) (test_cur_vmsize * REPORTING_THRESHOLD)) +          \
+                  "\" vmpeak=\"" + std::to_string((int) (test_cur_vmpeak * REPORTING_THRESHOLD)) +          \
+                  "\" vmrss=\"" + std::to_string((int) (test_cur_vmrss * REPORTING_THRESHOLD)) +            \
+                  "\" vmhwm=\"" + std::to_string((int) (test_cur_vmhwm * REPORTING_THRESHOLD)) + "\" />");
+
+#define log_info_ref_mem_usage()                                                                \
+        log_info("Reference values of virtual memory consumption:");                            \
+        log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK");                                               \
+        log_info(ref_vmrss << "\t\t" << ref_vmhwm << "\t\t" << ref_vmsize << "\t\t" << ref_vmpeak);
+
+#define log_info_cur_mem_usage()                                                                                    \
+        log_info("Current values of virtual memory consumption:");                                                  \
+        log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK");                                                                   \
+        log_info(test_cur_vmrss << "\t\t" << test_cur_vmhwm << "\t\t" << test_cur_vmsize << "\t\t" << test_cur_vmpeak);
+
+TestResult
+test_create_exenetwork(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+                       const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm) {
+    log_info("Create ExecutableNetwork from network: \"" << model_path
+                                                         << "\" for device: \"" << target_device << "\"");
+    long vmsize_before_test = 0, vmrss_before_test = 0,
+            test_cur_vmsize = 0, test_cur_vmpeak = 0,
+            test_cur_vmrss = 0, test_cur_vmhwm = 0;
+
+    vmsize_before_test = (long) getVmSizeInKB();
+    vmrss_before_test = (long) getVmRSSInKB();
+
+    create_exenetwork(model_path, target_device)();
+
+    getAlignedVmValues(test_cur_vmsize, test_cur_vmpeak, test_cur_vmrss, test_cur_vmhwm,
+                       vmsize_before_test, vmrss_before_test);
+
+    log_debug_ref_record_for_test("create_exenetwork");
+    log_info_ref_mem_usage();
+    log_info_cur_mem_usage();
+
+    if (test_cur_vmhwm > ref_vmhwm)
+        return TestResult(TestStatus::TEST_FAILED,
+                          "Test failed: HWM (peak of RSS) virtual memory consumption is greater than reference.\n"
+                          "Reference HWM of memory consumption: " + std::to_string(ref_vmhwm) + " KB.\n" +
+                          "Current HWM of memory consumption: " + std::to_string(test_cur_vmhwm) + " KB.\n");
+
+    return TestResult(TestStatus::TEST_OK, "");
+}
+
+TestResult
+test_infer_request_inference(const std::string &model_name, const std::string &model_path,
+                             const std::string &target_device,
+                             const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss,
+                             const long &ref_vmhwm) {
+    log_info("Inference of InferRequest from network: \"" << model_path
+                                                          << "\" for device: \"" << target_device << "\"");
+    long vmsize_before_test = 0, vmrss_before_test = 0,
+            test_cur_vmsize = 0, test_cur_vmpeak = 0,
+            test_cur_vmrss = 0, test_cur_vmhwm = 0;
+    std::chrono::system_clock::time_point t_start, t_end;
+    std::chrono::duration<double> t_diff;
+
+    vmsize_before_test = (long) getVmSizeInKB();
+    vmrss_before_test = (long) getVmRSSInKB();
+
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model_path);
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+
+    log_info_ref_mem_usage();
+
+    t_start = std::chrono::system_clock::now();
+    int seconds = 1;
+    do {
+        infer_request.Infer();
+        OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+        for (auto &output : output_info)
+            Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+        t_end = std::chrono::system_clock::now();
+        t_diff = t_end - t_start;
+
+        getAlignedVmValues(test_cur_vmsize, test_cur_vmpeak, test_cur_vmrss, test_cur_vmhwm,
+                           vmsize_before_test, vmrss_before_test);
+
+        if (test_cur_vmrss > ref_vmrss) {
+            log_debug_ref_record_for_test("infer_request_inference");
+            return TestResult(TestStatus::TEST_FAILED,
+                              "Test failed: RSS virtual memory consumption became greater than reference "
+                              "after " + std::to_string(t_diff.count()) + " sec of inference.\n"
+                              "Reference RSS memory consumption: " + std::to_string(ref_vmrss) + " KB.\n" +
+                              "Current RSS memory consumption: " + std::to_string(test_cur_vmrss) + " KB.\n");
+        }
+
+        if (t_diff.count() > (double) (seconds)) {
+            log_info("Current values of virtual memory consumption after " << seconds << " seconds:");
+            log_info("VMRSS\t\tVMHWM\t\tVMSIZE\t\tVMPEAK");
+            log_info(test_cur_vmrss << "\t\t" << test_cur_vmhwm << "\t\t" << test_cur_vmsize << "\t\t" << test_cur_vmpeak);
+            seconds++;
+        }
+    } while (t_diff.count() < 5);
+    log_debug_ref_record_for_test("infer_request_inference");
+
+    return TestResult(TestStatus::TEST_OK, "");
+}
diff --git a/tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.h b/tests/stress_tests/memcheck_tests/tests_pipelines/tests_pipelines.h
new file mode 100644 (file)
index 0000000..0712bca
--- /dev/null
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "../../common/tests_utils.h"
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+// tests_pipelines/tests_pipelines.cpp
+TestResult test_create_exenetwork(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+                                  const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm);
+TestResult test_infer_request_inference(const std::string &model_name, const std::string &model_path, const std::string &target_device,
+                                        const long &ref_vmsize, const long &ref_vmpeak, const long &ref_vmrss, const long &ref_vmhwm);
+// tests_pipelines/tests_pipelines.cpp
diff --git a/tests/stress_tests/memcheck_tests/tests_utils.h b/tests/stress_tests/memcheck_tests/tests_utils.h
new file mode 100644 (file)
index 0000000..32afff1
--- /dev/null
@@ -0,0 +1,72 @@
+#include "../common/tests_utils.h"
+
+#include <pugixml.hpp>
+
+class MemCheckEnvironment {
+private:
+    pugi::xml_document _refs_config;
+    MemCheckEnvironment() = default;
+    MemCheckEnvironment(const MemCheckEnvironment&) = delete;
+    MemCheckEnvironment& operator=(const MemCheckEnvironment&) = delete;
+public:
+    static MemCheckEnvironment& Instance(){
+        static MemCheckEnvironment env;
+        return env;
+    }
+
+    const pugi::xml_document & getRefsConfig() {
+        return _refs_config;
+    }
+
+    void setRefsConfig(const pugi::xml_document &refs_config) {
+        _refs_config.reset(refs_config);
+    }
+};
+
+class TestReferences {
+private:
+    std::vector<std::string> model_path_v, test_name_v, device_v;
+    std::vector<long> vmsize_v, vmpeak_v, vmrss_v, vmhwm_v;
+public:
+    long ref_vmsize = -1, ref_vmpeak = -1, ref_vmrss = -1, ref_vmhwm = -1;
+
+    TestReferences () {
+        // Parse RefsConfig from MemCheckEnvironment
+        std::string models_path = Environment::Instance().getEnvConfig()
+                .child("attributes").child("irs_path").child("value").text().as_string();
+
+        const pugi::xml_document &refs_config = MemCheckEnvironment::Instance().getRefsConfig();
+        auto values = refs_config.child("attributes").child("models");
+        for (pugi::xml_node node = values.first_child(); node; node = node.next_sibling()) {
+            for (pugi::xml_attribute_iterator ait = node.attributes_begin(); ait != node.attributes_end(); ait++) {
+                if (strncmp(ait->name(), "path", strlen(ait->name())) == 0) {
+                    model_path_v.push_back(OS_PATH_JOIN({models_path, ait->value()}));
+                } else if (strncmp(ait->name(), "test", strlen(ait->name())) == 0) {
+                    test_name_v.push_back(ait->value());
+                } else if (strncmp(ait->name(), "device", strlen(ait->name())) == 0) {
+                    device_v.push_back(ait->value());
+                } else if (strncmp(ait->name(), "vmsize", strlen(ait->name())) == 0) {
+                    vmsize_v.push_back(std::atoi(ait->value()));
+                } else if (strncmp(ait->name(), "vmpeak", strlen(ait->name())) == 0) {
+                    vmpeak_v.push_back(std::atoi(ait->value()));
+                } else if (strncmp(ait->name(), "vmrss", strlen(ait->name())) == 0) {
+                    vmrss_v.push_back(std::atoi(ait->value()));
+                } else if (strncmp(ait->name(), "vmhwm", strlen(ait->name())) == 0) {
+                    vmhwm_v.push_back(std::atoi(ait->value()));
+                }
+            }
+        }
+    }
+
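+    // Look up the reference VM values recorded for this (test, model, device) combination, if present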
+    void collect_vm_values_for_test(const std::string &test_name, const TestCase &test_params) {
+        for (size_t i = 0; i < test_name_v.size(); i++)
+            if (test_name_v[i] == test_name &&
+                model_path_v[i] == test_params.model &&
+                device_v[i] == test_params.device) {
+                ref_vmsize = vmsize_v[i];
+                ref_vmpeak = vmpeak_v[i];
+                ref_vmrss = vmrss_v[i];
+                ref_vmhwm = vmhwm_v[i];
+            }
+    }
+};
\ No newline at end of file
diff --git a/tests/stress_tests/memleaks_tests/CMakeLists.txt b/tests/stress_tests/memleaks_tests/CMakeLists.txt
new file mode 100644 (file)
index 0000000..a396454
--- /dev/null
@@ -0,0 +1,40 @@
+# Copyright (C) 2018-2019 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "StressMemLeaksTests")
+
+file (GLOB SRC
+        ../common/*.cpp
+        ../common/ie_pipelines/*.cpp
+        *.cpp
+        tests_pipelines/*.cpp)
+
+file (GLOB HDR
+        ../common/*.h
+        ../common/managers/*.h
+        ../common/ie_pipelines/*.h
+        *.h
+        tests_pipelines/*.h)
+
+# Create executable from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+find_package(Threads REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+        IE::gtest
+        IE::gtest_main
+        IE::pugixml
+        gflags
+        Threads::Threads
+        ${InferenceEngine_LIBRARIES}
+        )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+        "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memleaks_tests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/memleaks_tests/env_config.xml COPYONLY)
diff --git a/tests/stress_tests/memleaks_tests/flags.h b/tests/stress_tests/memleaks_tests/flags.h
new file mode 100644 (file)
index 0000000..9687797
--- /dev/null
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "memleaks_tests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "memleaks_tests", "env_config.xml"}), env_conf_message);
\ No newline at end of file
diff --git a/tests/stress_tests/memleaks_tests/local_configs/env_config.xml b/tests/stress_tests/memleaks_tests/local_configs/env_config.xml
new file mode 100644 (file)
index 0000000..7d356d0
--- /dev/null
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<attributes>
+    <irs_path>
+        <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+    </irs_path>
+</attributes>
diff --git a/tests/stress_tests/memleaks_tests/local_configs/test_config.xml b/tests/stress_tests/memleaks_tests/local_configs/test_config.xml
new file mode 100644 (file)
index 0000000..28698b5
--- /dev/null
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<attributes>
+<!--[ WARNING ] Use of attribute "processes" from config isn't implemented yet. It will be ignored.-->
+    <processes>
+        <value>1</value>
+    </processes>
+    <threads>
+        <value>1</value>
+    </threads>
+    <iterations>
+        <value>30</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+<!--        <value>GPU</value>-->
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/memleaks_tests/main.cpp b/tests/stress_tests/memleaks_tests/main.cpp
new file mode 100644 (file)
index 0000000..93752ab
--- /dev/null
@@ -0,0 +1,60 @@
+#include "flags.h"
+#include "../common/utils.h"
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+bool parseAndCheckCommandLine(int argc, char **argv) {
+    // ---------------------------Parsing and validating input arguments--------------------------------------
+    log_info("Parsing input parameters");
+
+    int new_argc = 0;
+    std::vector<char*> _argv;
+    for (int i = 0; i < argc; i++) {
+        if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+            _argv.push_back(argv[i]);
+            new_argc++;
+        }
+    }
+    char **new_argv = &_argv[0];
+    gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+    if (FLAGS_help || FLAGS_h) {
+        // TODO print info
+        //::testing::InitGoogleTest(&argc, argv);
+        return false;
+    }
+
+    pugi::xml_document config;
+    pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_env_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+        return false;
+    }
+    return true;
+}
+
+
+int main(int argc, char **argv) {
+    log_warn("Use of attribute \"processes\" from config isn't implemented yet. It will be ignored.");
+    log_warn("Use of attribute \"threads\" from config greater than 1 is risky because of "
+             "no synchronization between steps from different threads. Tests results may be non-deterministic.");
+    if (!parseAndCheckCommandLine(argc, argv)) {
+        return 0;   // TODO return correct status
+    }
+
+    pugi::xml_document config;
+    config.load_file(FLAGS_test_conf.c_str());
+    Environment::Instance().setTestConfig(config);
+    config.load_file(FLAGS_env_conf.c_str());
+    Environment::Instance().setEnvConfig(config);
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/tests/stress_tests/memleaks_tests/tests.cpp b/tests/stress_tests/memleaks_tests/tests.cpp
new file mode 100644 (file)
index 0000000..99416be
--- /dev/null
@@ -0,0 +1,123 @@
+#include "../common/tests_utils.h"
+#include "../common/managers/thread_manager.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <inference_engine.hpp>
+
+#include <gtest/gtest.h>
+
+using namespace InferenceEngine;
+
+class MemLeaksTestSuiteNoModel : public ::testing::TestWithParam<TestCase> {
+};
+
+class MemLeaksTestSuiteNoDevice : public ::testing::TestWithParam<TestCase> {
+};
+
+class MemLeaksTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
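+// Run the given test function on numthreads parallel threads via ThreadManager and check that every
+// thread finished successfully and returned TEST_OK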
+inline void test_runner(int numthreads, const std::function<TestResult()> &test_function) {
+    ThreadManager<TestResult> thr_manager;
+    for (int i = 0; i < numthreads; i++)
+        thr_manager.add_task(test_function);
+    thr_manager.run_parallel_n_wait();
+
+    std::vector<ManagerStatus> statuses = thr_manager.get_all_statuses();
+    std::vector<TestResult> results = thr_manager.get_all_results();
+
+    for (int i = 0; i < numthreads; i++) {
+        EXPECT_EQ(statuses[i], ManagerStatus::FINISHED_SUCCESSFULLY)
+                            << "[Thread " << i << "] Thread not finished successfully";
+        EXPECT_EQ(results[i].first, TestStatus::TEST_OK) << "[Thread " << i << "] " << results[i].second;
+    }
+}
+
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(MemLeaksTestSuiteNoModel, load_unload_plugin) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        return test_load_unload_plugin(test_params.device, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, read_network) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        return test_read_network(test_params.model, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, create_cnnnetwork) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        return test_create_cnnnetwork(test_params.model, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, cnnnetwork_reshape_batch_x2) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        return test_cnnnetwork_reshape_batch_x2(test_params.model, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuiteNoDevice, set_input_params) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        return test_set_input_params(test_params.model, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, recreate_exenetwork) {
+    auto test_params = GetParam();
+    Core ie;
+    auto test = [&] {
+        return test_recreate_exenetwork(ie, test_params.model, test_params.device, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, recreate_infer_request) {
+    auto test_params = GetParam();
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(test_params.model);
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, test_params.device);
+    auto test = [&] {
+        return test_recreate_infer_request(exeNetwork, test_params.model, test_params.device, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+
+TEST_P(MemLeaksTestSuite, reinfer_request_inference) {
+    auto test_params = GetParam();
+    auto test = [&] {
+        Core ie;
+        CNNNetwork cnnNetwork = ie.ReadNetwork(test_params.model);
+        ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, test_params.device);
+        InferRequest infer_request = exeNetwork.CreateInferRequest();
+        return test_reinfer_request_inference(infer_request, cnnNetwork, test_params.model, test_params.device, test_params.numiters);
+    };
+    test_runner(test_params.numthreads, test);
+}
+// tests_pipelines/tests_pipelines.cpp
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuiteNoModel,
+                        ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices"})),
+                        getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuiteNoDevice,
+                        ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "models"})),
+                        getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(MemLeaksTests, MemLeaksTestSuite,
+                        ::testing::ValuesIn(
+                                generateTestsParams({"processes", "threads", "iterations", "devices", "models"})),
+                        getTestCaseName);
+
diff --git a/tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.cpp b/tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.cpp
new file mode 100644 (file)
index 0000000..49e60d3
--- /dev/null
@@ -0,0 +1,185 @@
+#include "tests_pipelines.h"
+
+#include <math.h>
+
+#include <inference_engine.hpp>
+#include <algorithm>
+#include <array>
+#include <string>
+
+using namespace InferenceEngine;
+
+// Number of pipeline runs before it starts measuring
+#define WARMUP_STEPS 30
+// Number of memory peaks ignored. The libc memory manager can produce peaks
+// while overall consumption stays flat
+#define MAX_OUTLIERS 5
+// Maximum number of measuring pipeline restarts
+#define MAX_RETRY 3
+// A threshold for which memory growth will be considered an error
+#define THRESHOLD 0.1
+
+// Measure values
+enum MeasureValue { VMRSS = 0, VMHWM, VMSIZE, VMPEAK, MeasureValueMax };
+
+namespace util {    
+template <typename In, typename Out, typename Func>
+void transform(const In& in, Out& out, const Func& func) {
+    std::transform(std::begin(in), std::end(in), std::begin(out), func);
+}
+
+template <typename In1, typename In2, typename Out, typename Func>
+void transform(const In1& in1, const In2& in2, Out& out, const Func& func) {
+    std::transform(std::begin(in1), std::end(in1), std::begin(in2), std::begin(out), func);
+}
+}  // namespace util
+
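+// Run test_pipeline repeatedly: the values measured right after WARMUP_STEPS iterations become the reference,
+// each of the following n iterations is compared against reference +/- THRESHOLD, and the measurement is
+// restarted (up to MAX_RETRY times) when more than MAX_OUTLIERS iterations exceed the threshold.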
+TestResult common_test_pipeline(const std::function<void()>& test_pipeline, const int& n) {
+    int retry_count = 0;
+    float mem_threshold = THRESHOLD;
+    std::array<long, MeasureValueMax> cur = {0};           // measured for current iteration
+    std::array<long, MeasureValueMax> ref = {0};           // recorded reference
+    std::array<long, MeasureValueMax> diff = {0};          // difference between current and reference
+    std::array<bool, MeasureValueMax> outlier = {0};       // flag if current does not fit threshold
+    std::array<int, MeasureValueMax> outlier_count = {0};  // counter for how many times current does not fit threshold
+    std::array<float, MeasureValueMax> threshold = {0};    // ref * THRESHOLD
+    std::string progress_str;
+
+    progress_str.reserve(1024);
+
+    log_info("Warming up for " << WARMUP_STEPS << " iterations");
+    log_info("i\tVMRSS\tVMHWM\tVMSIZE\tVMPEAK");
+    int measure_count = n;
+    for (int iteration = 0; measure_count > 0; iteration++) {
+        // Warm up to take reference values
+        test_pipeline();
+        getVmValues(cur[VMSIZE], cur[VMPEAK], cur[VMRSS], cur[VMHWM]);
+        progress_str = std::to_string(iteration + 1) + "\t" + std::to_string(cur[VMRSS]) + "\t" +
+                       std::to_string(cur[VMHWM]) + "\t" + std::to_string(cur[VMSIZE]) + "\t" +
+                       std::to_string(cur[VMPEAK]);
+
+        // measure
+        if (iteration >= WARMUP_STEPS) {
+            // set reference
+            if (WARMUP_STEPS == iteration || (retry_count < MAX_RETRY && (outlier_count[VMRSS] > MAX_OUTLIERS ||
+                                                                          outlier_count[VMHWM] > MAX_OUTLIERS))) {
+                if (0 != retry_count) log_info("Retrying " << retry_count + 1 << " of " << MAX_RETRY);
+                retry_count++;
+                measure_count = n;
+                outlier_count = {0};
+                ref = cur;
+                util::transform(ref, threshold, [](long ref_val) -> float {
+                    return THRESHOLD * ref_val;
+                });
+                log_info("Setting thresholds VMRSS=" << ref[VMRSS] << "(+-" << static_cast<int>(threshold[VMRSS])
+                                                     << "), VMHWM=" << ref[VMHWM] << "(+-"
+                                                     << static_cast<int>(threshold[VMHWM]) << ")");
+            }
+            measure_count--;
+            util::transform(cur, ref, diff, [](long cur_val, long ref_val) -> long {
+                return labs(cur_val - ref_val);
+            });
+            util::transform(diff, threshold, outlier, [](long diff_val, float threshold_val) -> bool {
+                return diff_val > threshold_val;
+            });
+            util::transform(outlier, outlier_count, outlier_count,
+                            [](bool outlier_val, long outlier_count_val) -> long {
+                                return outlier_count_val + (outlier_val ? 1 : 0);
+                            });
+
+            if (outlier[VMRSS]) {
+                progress_str += "\t<-VMRSS outlier";
+            }
+            if (outlier[VMHWM]) {
+                progress_str += "\t<-VMHWM outlier";
+            }
+        }
+
+        log_info(progress_str);
+    }
+
+    if (outlier_count[VMRSS] > MAX_OUTLIERS)
+        return TestResult(TestStatus::TEST_FAILED, "Test failed: RSS virtual memory consumption has grown too much.");
+
+    if (outlier_count[VMHWM] > MAX_OUTLIERS)
+        return TestResult(TestStatus::TEST_FAILED, "Test failed: HWM virtual memory consumption has grown too much.");
+
+    return TestResult(TestStatus::TEST_OK, "");
+}
+
+
+TestResult test_load_unload_plugin(const std::string &target_device, const int &n) {
+    log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+    return common_test_pipeline(load_unload_plugin(target_device), n);
+}
+
+TestResult test_read_network(const std::string &model, const int &n) {
+    log_info("Read network: \"" << model << "\" for " << n << " times");
+    return common_test_pipeline(read_network(model), n);
+}
+
+TestResult test_create_cnnnetwork(const std::string &model, const int &n) {
+    log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    return common_test_pipeline(create_cnnnetwork(model), n);
+}
+
+TestResult test_cnnnetwork_reshape_batch_x2(const std::string &model, const int &n) {
+    log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+    return common_test_pipeline(cnnnetwork_reshape_batch_x2(model), n);
+}
+
+TestResult test_set_input_params(const std::string &model, const int &n) {
+    log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    return common_test_pipeline(set_input_params(model), n);
+}
+
+TestResult test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create ExecutableNetwork from network: \"" << model
+                                                         << "\" for device: \"" << target_device << "\" for " << n
+                                                         << " times");
+    return common_test_pipeline(create_exenetwork(model, target_device), n);
+}
+
+TestResult
+test_recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device,
+                         const int &n) {
+    log_info("Recreate ExecutableNetwork from network within existing InferenceEngine::Core: \"" << model
+                                                                                                 << "\" for device: \""
+                                                                                                 << target_device
+                                                                                                 << "\" for " << n
+                                                                                                 << " times");
+    return common_test_pipeline(recreate_exenetwork(ie, model, target_device), n);
+}
+
+TestResult test_create_infer_request(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create InferRequest from network: \"" << model
+                                                    << "\" for device: \"" << target_device << "\" for " << n
+                                                    << " times");
+    return common_test_pipeline(create_infer_request(model, target_device), n);
+}
+
+TestResult
+test_recreate_infer_request(ExecutableNetwork &network, const std::string &model, const std::string &target_device,
+                            const int &n) {
+    log_info("Create InferRequest from network: \"" << model
+                                                    << "\" for device: \"" << target_device << "\" for " << n
+                                                    << " times");
+    return common_test_pipeline(recreate_infer_request(network), n);
+}
+
+TestResult
+test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Inference of InferRequest from network: \"" << model
+                                                          << "\" for device: \"" << target_device << "\" for " << n
+                                                          << " times");
+    return common_test_pipeline(infer_request_inference(model, target_device), n);
+}
+
+TestResult
+test_reinfer_request_inference(InferenceEngine::InferRequest &infer_request, InferenceEngine::CNNNetwork &cnnNetwork,
+                               const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Inference of InferRequest from network: \"" << model
+                                                          << "\" for device: \"" << target_device << "\" for " << n
+                                                          << " times");
+    return common_test_pipeline(reinfer_request_inference(infer_request, cnnNetwork), n);
+}
diff --git a/tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.h b/tests/stress_tests/memleaks_tests/tests_pipelines/tests_pipelines.h
new file mode 100644 (file)
index 0000000..2d144af
--- /dev/null
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "../../common/tests_utils.h"
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+// tests_pipelines/tests_pipelines.cpp
+TestResult test_load_unload_plugin(const std::string &target_device, const int &n);
+TestResult test_read_network(const std::string &model, const int &n);
+TestResult test_create_cnnnetwork(const std::string &model, const int &n);
+TestResult test_cnnnetwork_reshape_batch_x2(const std::string &model, const int &n);
+TestResult test_set_input_params(const std::string &model, const int &n);
+TestResult test_recreate_exenetwork(InferenceEngine::Core &ie, const std::string &model, const std::string &target_device, const int &n);
+TestResult test_create_infer_request(const std::string &model, const std::string &target_device, const int &n);
+TestResult test_recreate_infer_request(InferenceEngine::ExecutableNetwork& network, const std::string &model, const std::string &target_device, const int &n);
+TestResult test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n);
+TestResult test_reinfer_request_inference(InferenceEngine::InferRequest& infer_request, InferenceEngine::CNNNetwork& cnnNetwork, const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines.cpp
diff --git a/tests/stress_tests/scripts/get_testdata.py b/tests/stress_tests/scripts/get_testdata.py
new file mode 100644 (file)
index 0000000..0c73b26
--- /dev/null
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+""" Script to acquire model IRs for stress tests.
+Usage: ./scripts/get_testdata.py
+"""
+import argparse
+import multiprocessing
+import os
+import shutil
+import subprocess
+from inspect import getsourcefile
+
+# Parameters
+MODEL_NAMES = 'vgg16,mtcnn-r,mobilenet-ssd,ssd300'
+OMZ_VERSION = 'efd238d02035f8a5417b7b1e25cd4c997d44351f'
+
+
+def abs_path(relative_path):
+    """Return absolute path given path relative to the current file.
+    """
+    return os.path.realpath(
+        os.path.join(os.path.dirname(getsourcefile(lambda: 0)), relative_path))
+
+
+def main():
+    """Main entry point.
+    """
+    parser = argparse.ArgumentParser(
+        description='Acquire test data',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+    parser.add_argument('--output_dir', default=f'./_models', help='directory to put test data into')
+    parser.add_argument('--cache_dir', default=f'./_cache', help='directory with test data cache')
+    args = parser.parse_args()
+
+    # Clone Open Model Zoo into temporary path
+    omz_path = './_open_model_zoo'
+    if os.path.exists(omz_path):
+        shutil.rmtree(omz_path)
+    subprocess.check_call(
+        f'git clone https://github.com/opencv/open_model_zoo {omz_path}' \
+        f' && cd {omz_path}'\
+        f' && git checkout {OMZ_VERSION}', shell=True)
+    # Acquire model IRs
+    mo_tool = abs_path('../../../model-optimizer/mo.py')
+    subprocess.check_call(
+        f'{omz_path}/tools/downloader/downloader.py --name "{MODEL_NAMES}"' \
+        f' --output_dir {args.output_dir}/{OMZ_VERSION}/models' \
+        f' --cache_dir {args.cache_dir}', shell=True)
+    subprocess.check_call(
+        f'{omz_path}/tools/downloader/converter.py --name "{MODEL_NAMES}"' \
+        f' --output_dir {args.output_dir}/{OMZ_VERSION}/IRs' \
+        f' --download_dir {args.output_dir}/{OMZ_VERSION}/models' \
+        f' --mo {mo_tool} --jobs {multiprocessing.cpu_count()}', shell=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/stress_tests/unittests/CMakeLists.txt b/tests/stress_tests/unittests/CMakeLists.txt
new file mode 100644 (file)
index 0000000..f41ba48
--- /dev/null
@@ -0,0 +1,40 @@
+# Copyright (C) 2018-2020 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+set (TARGET_NAME "StressUnitTests")
+
+file (GLOB SRC
+        ../common/*.cpp
+        ../common/ie_pipelines/*.cpp
+        *.cpp
+        tests_pipelines/*.cpp)
+
+file (GLOB HDR
+        ../common/*.h
+        ../common/managers/*.h
+        ../common/ie_pipelines/*.h
+        *.h
+        tests_pipelines/*.h)
+
+# Create executable from sources.
+add_executable(${TARGET_NAME} ${HDR} ${SRC})
+
+find_package(gflags REQUIRED)
+find_package(Threads REQUIRED)
+
+target_link_libraries(${TARGET_NAME}
+        IE::gtest
+        IE::gtest_main
+        IE::pugixml
+        gflags
+        Threads::Threads
+        ${InferenceEngine_LIBRARIES}
+        )
+
+target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
+        "${IE_MAIN_SOURCE_DIR}/thirdparty/pugixml/src")
+
+# Copy local configs to BIN_FOLDER
+configure_file(local_configs/test_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/unittests/test_config.xml COPYONLY)
+configure_file(local_configs/env_config.xml ${OUTPUT_ROOT}/${BIN_FOLDER}/${CMAKE_BUILD_TYPE}/stress_tests_configs/unittests/env_config.xml COPYONLY)
diff --git a/tests/stress_tests/unittests/flags.h b/tests/stress_tests/unittests/flags.h
new file mode 100644 (file)
index 0000000..7f4ff8a
--- /dev/null
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../common/utils.h"
+
+#include <gflags/gflags.h>
+
+/// @brief message for help argument
+static const char help_message[] = "Print a usage message";
+
+/// @brief Define flag for showing help message <br>
+DEFINE_bool(h, false, help_message);
+
+/// @brief Declare flag for showing help message <br>
+DECLARE_bool(help);
+
+/// @brief message for test_config argument
+static const char test_conf_message[] = "Optional. Path to a test config with description about number of threads, iterations etc.";
+
+/// @brief Define parameter for set test's configuration <br>
+/// test_conf is an optional parameter
+DEFINE_string(test_conf, OS_PATH_JOIN({"stress_tests_configs", "unittests", "test_config.xml"}), test_conf_message);
+
+/// @brief message for env_config argument
+static const char env_conf_message[] = "Optional. Path to an env config with paths to models etc.";
+
+/// @brief Define parameter for set environment <br>
+/// env_conf is an optional parameter
+DEFINE_string(env_conf, OS_PATH_JOIN({"stress_tests_configs", "unittests", "env_config.xml"}), env_conf_message);
\ No newline at end of file
diff --git a/tests/stress_tests/unittests/local_configs/env_config.xml b/tests/stress_tests/unittests/local_configs/env_config.xml
new file mode 100644 (file)
index 0000000..7d356d0
--- /dev/null
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<attributes>
+    <irs_path>
+        <value>/nfs/inn/proj/vdp/vdp_tests/stress_tests/master_04d6f112132f92cab563ae7655747e0359687dc9/</value>
+    </irs_path>
+</attributes>
diff --git a/tests/stress_tests/unittests/local_configs/test_config.xml b/tests/stress_tests/unittests/local_configs/test_config.xml
new file mode 100644 (file)
index 0000000..162f6f1
--- /dev/null
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<attributes>
+    <processes>
+        <value>1</value>
+    </processes>
+    <threads>
+        <value>1</value>
+    </threads>
+    <iterations>
+        <value>100</value>
+    </iterations>
+    <devices>
+        <value>CPU</value>
+        <value>GPU</value>
+    </devices>
+    <models>
+        <value>caffe/FP32/alexnet/alexnet.xml</value>
+    </models>
+</attributes>
diff --git a/tests/stress_tests/unittests/main.cpp b/tests/stress_tests/unittests/main.cpp
new file mode 100644 (file)
index 0000000..3d23a64
--- /dev/null
@@ -0,0 +1,57 @@
+#include "flags.h"
+#include "../common/utils.h"
+#include "../common/tests_utils.h"
+
+#include <gtest/gtest.h>
+#include <pugixml.hpp>
+
+
+bool parseAndCheckCommandLine(int argc, char **argv) {
+    // ---------------------------Parsing and validating input arguments--------------------------------------
+    log_info("Parsing input parameters");
+
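+    // Drop gtest-specific arguments (--gtest*) so that gflags parses only its own flags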
+    int new_argc = 0;
+    std::vector<char*> _argv;
+    for (int i = 0; i < argc; i++) {
+        if ("--gtest" != std::string(argv[i]).substr(0, 7)) {
+            _argv.push_back(argv[i]);
+            new_argc++;
+        }
+    }
+    char **new_argv = &_argv[0];
+    gflags::ParseCommandLineNonHelpFlags(&new_argc, &new_argv, true);
+
+    if (FLAGS_help || FLAGS_h) {
+        // TODO print info
+        //::testing::InitGoogleTest(&argc, argv);
+        return false;
+    }
+
+    pugi::xml_document config;
+    pugi::xml_parse_result result = config.load_file(FLAGS_test_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading test config \"" << FLAGS_test_conf << "\": " << result.description());
+        return false;
+    }
+    result = config.load_file(FLAGS_env_conf.c_str());
+    if (!result) {
+        log_err("Exception while reading env config \"" << FLAGS_env_conf << "\": " << result.description());
+        return false;
+    }
+    return true;
+}
+
+
+int main(int argc, char **argv) {
+    if (!parseAndCheckCommandLine(argc, argv)) {
+        return 0;   // TODO return correct status
+    }
+
+    pugi::xml_document config;
+    config.load_file(FLAGS_test_conf.c_str());
+    Environment::Instance().setTestConfig(config);
+    config.load_file(FLAGS_env_conf.c_str());
+    Environment::Instance().setEnvConfig(config);
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/tests/stress_tests/unittests/tests.cpp b/tests/stress_tests/unittests/tests.cpp
new file mode 100644 (file)
index 0000000..f710d36
--- /dev/null
@@ -0,0 +1,94 @@
+#include "../common/tests_utils.h"
+#include "tests_pipelines/tests_pipelines.h"
+
+#include <gtest/gtest.h>
+
+class UnitTestSuiteNoModel : public ::testing::TestWithParam<TestCase> {
+};
+
+class UnitTestSuiteNoDevice : public ::testing::TestWithParam<TestCase> {
+};
+
+class UnitTestSuite : public ::testing::TestWithParam<TestCase> {
+};
+
+// tests_pipelines/tests_pipelines.cpp
+TEST_P(UnitTestSuiteNoModel, load_unload_plugin) {
+    runTest(test_load_unload_plugin, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, read_network) {
+    runTest(test_read_network, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, create_cnnnetwork) {
+    runTest(test_create_cnnnetwork, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, cnnnetwork_reshape_batch_x2) {
+    runTest(test_cnnnetwork_reshape_batch_x2, GetParam());
+}
+
+TEST_P(UnitTestSuiteNoDevice, set_input_params) {
+    runTest(test_set_input_params, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_exenetwork) {
+    runTest(test_create_exenetwork, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_infer_request) {
+    runTest(test_create_infer_request, GetParam());
+}
+
+TEST_P(UnitTestSuite, infer_request_inference) {
+    runTest(test_infer_request_inference, GetParam());
+}
+// tests_pipelines/tests_pipelines.cpp
+
+
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+TEST_P(UnitTestSuite, load_unload_plugin_full_pipeline) {
+    runTest(test_load_unload_plugin_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, read_network_full_pipeline) {
+    runTest(test_read_network_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_cnnnetwork_full_pipeline) {
+    runTest(test_create_cnnnetwork_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, set_input_params_full_pipeline) {
+    runTest(test_set_input_params_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, cnnnetwork_reshape_batch_x2_full_pipeline) {
+    runTest(test_cnnnetwork_reshape_batch_x2_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_exenetwork_full_pipeline) {
+    runTest(test_create_exenetwork_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, create_infer_request_full_pipeline) {
+    runTest(test_create_infer_request_full_pipeline, GetParam());
+}
+
+TEST_P(UnitTestSuite, infer_request_inference_full_pipeline) {
+    runTest(test_infer_request_inference_full_pipeline, GetParam());
+}
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuiteNoModel,
+                        ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices"})),
+                        getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuiteNoDevice,
+                        ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "models"})),
+                        getTestCaseName);
+
+INSTANTIATE_TEST_CASE_P(StressUnitTests, UnitTestSuite,
+                        ::testing::ValuesIn(generateTestsParams({"processes", "threads", "iterations", "devices", "models"})),
+                        getTestCaseName);
diff --git a/tests/stress_tests/unittests/tests_pipelines/tests_pipelines.cpp b/tests/stress_tests/unittests/tests_pipelines/tests_pipelines.cpp
new file mode 100644 (file)
index 0000000..afa4845
--- /dev/null
@@ -0,0 +1,91 @@
+#include "tests_pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+
+using namespace InferenceEngine;
+
+void test_load_unload_plugin(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        load_unload_plugin(target_device)();
+    }
+}
+
+void test_read_network(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Read network: \"" << model << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        read_network(model)();
+    }
+}
+
+void test_create_cnnnetwork(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        create_cnnnetwork(model)();
+    }
+}
+
+void test_cnnnetwork_reshape_batch_x2(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        cnnnetwork_reshape_batch_x2(model)();
+    }
+}
+
+void test_set_input_params(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        set_input_params(model)();
+    }
+}
+
+void test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create ExecutableNetwork from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        create_exenetwork(model, target_device)();
+    }
+}
+
+void test_create_infer_request(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create InferRequest from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        create_infer_request(model, target_device)();
+    }
+}
+
+void test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Inference of InferRequest from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        infer_request_inference(model, target_device)();
+    }
+}
diff --git a/tests/stress_tests/unittests/tests_pipelines/tests_pipelines.h b/tests/stress_tests/unittests/tests_pipelines/tests_pipelines.h
new file mode 100644 (file)
index 0000000..7e5ef80
--- /dev/null
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "../../common/utils.h"
+#include "../../common/ie_pipelines/pipelines.h"
+
+#include <string>
+
+// tests_pipelines/tests_pipelines.cpp
+void test_load_unload_plugin(const std::string &model, const std::string &target_device, const int &n);
+void test_read_network(const std::string &model, const std::string &target_device, const int &n);
+void test_create_cnnnetwork(const std::string &model, const std::string &target_device, const int &n);
+void test_cnnnetwork_reshape_batch_x2(const std::string &model, const std::string &target_device, const int &n);
+void test_set_input_params(const std::string &model, const std::string &target_device, const int &n);
+void test_create_exenetwork(const std::string &model, const std::string &target_device, const int &n);
+void test_create_infer_request(const std::string &model, const std::string &target_device, const int &n);
+void test_infer_request_inference(const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines.cpp
+
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
+void test_load_unload_plugin_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_read_network_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_cnnnetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_set_input_params_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_cnnnetwork_reshape_batch_x2_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_exenetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_create_infer_request_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+void test_infer_request_inference_full_pipeline(const std::string &model, const std::string &target_device, const int &n);
+// tests_pipelines/tests_pipelines_full_pipeline.cpp
diff --git a/tests/stress_tests/unittests/tests_pipelines/tests_pipelines_full_pipeline.cpp b/tests/stress_tests/unittests/tests_pipelines/tests_pipelines_full_pipeline.cpp
new file mode 100644 (file)
index 0000000..26a82b1
--- /dev/null
@@ -0,0 +1,266 @@
+#include "tests_pipelines.h"
+
+#include <string>
+
+#include <inference_engine.hpp>
+
+using namespace InferenceEngine;
+
+#define batchIndex 0
+
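+// Configure a network input: disable resize, set U8 precision, and pick NCHW/NC layout from the input rank (expects an "input" InputsDataMap entry in scope)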
+#define setInputParameters()                                                        \
+    input.second->getPreProcess().setResizeAlgorithm(NO_RESIZE);                    \
+    input.second->setPrecision(Precision::U8);                                      \
+    if (input.second->getInputData()->getTensorDesc().getDims().size() == 4)        \
+        input.second->setLayout(Layout::NCHW);                                      \
+    else if (input.second->getInputData()->getTensorDesc().getDims().size() == 2)   \
+        input.second->setLayout(Layout::NC);
+
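+// Double the batch dimension in "shapes" for NCHW/NC inputs and remember that a reshape is required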
+#define computeShapesToReshape()                                \
+    auto layout = input.second->getTensorDesc().getLayout();    \
+    if ((layout == Layout::NCHW) || (layout == Layout::NC)) {   \
+        shapes[input.first][batchIndex] *= 2;                   \
+        doReshape = true;                                       \
+    }
+
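+// Apply the collected shapes to cnnNetwork, or fail if no input was eligible for a batch reshape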
+#define reshapeCNNNetwork()                                             \
+    if (doReshape)                                                      \
+        cnnNetwork.reshape(shapes);                                     \
+    else                                                                \
+        throw std::logic_error("Reshape wasn't applied for a model.");
+
+void test_load_unload_plugin_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Load/unload plugin for device: " << target_device << " for " << n << " times");
+    Core ie;
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        // GetVersions silently register plugin in `plugins` through `GetCPPPluginByName`
+        ie.GetVersions(target_device);
+        // Remove plugin for target_device from `plugins`
+        ie.UnregisterPlugin(target_device);
+    }
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_read_network_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Read network: \"" << model << "\" for " << n << " times");
+    Core ie;
+    IE_SUPPRESS_DEPRECATED_START
+    std::shared_ptr<CNNNetReader> netReaderPtr;
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        CNNNetReader netReader;
+        netReader.ReadNetwork(model);
+        netReader.ReadWeights(fileNameNoExt(model) + ".bin");
+        netReaderPtr = std::make_shared<CNNNetReader>(netReader);
+    }
+    CNNNetwork cnnNetwork = netReaderPtr->getNetwork();
+    IE_SUPPRESS_DEPRECATED_END
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_cnnnetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork;
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        cnnNetwork = ie.ReadNetwork(model);
+    }
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_set_input_params_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Apply preprocessing for CNNNetwork from network: \"" << model << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        for (auto &input : inputInfo) {
+            setInputParameters();
+        }
+    }
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_cnnnetwork_reshape_batch_x2_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Reshape to batch*=2 of CNNNetwork created from network: \"" << model << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    for (auto &input : inputInfo) {
+        setInputParameters();
+    }
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    int prev_batch = -1, new_batch;
+    for (auto &input : inputInfo) {
+        auto layout = input.second->getTensorDesc().getLayout();
+        if ((layout == Layout::NCHW) || (layout == Layout::NC))
+            prev_batch = shapes[input.first][batchIndex];
+    }
+    if (prev_batch == -1)
+        throw std::logic_error("Reshape wasn't applied for a model.");
+
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+
+        new_batch = ((i % 2) == 0) ? prev_batch * 2 : prev_batch;
+        for (auto &input : inputInfo) {
+            auto layout = input.second->getTensorDesc().getLayout();
+            if ((layout == Layout::NCHW) || (layout == Layout::NC)) {
+                shapes[input.first][batchIndex] = new_batch;
+                doReshape = true;
+            }
+        }
+        reshapeCNNNetwork();
+    }
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_exenetwork_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create ExecutableNetwork from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork;
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    }
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_create_infer_request_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Create InferRequest from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request;
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        infer_request = exeNetwork.CreateInferRequest();
+    }
+    infer_request.Infer();
+    OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+    for (auto &output : output_info)
+        Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+}
+
+void test_infer_request_inference_full_pipeline(const std::string &model, const std::string &target_device, const int &n) {
+    log_info("Inference of InferRequest from network: \"" << model
+             << "\" for device: \"" << target_device << "\" for " << n << " times");
+    Core ie;
+    CNNNetwork cnnNetwork = ie.ReadNetwork(model);
+    InputsDataMap inputInfo(cnnNetwork.getInputsInfo());
+    ICNNNetwork::InputShapes shapes = cnnNetwork.getInputShapes();
+    bool doReshape = false;
+    for (auto &input : inputInfo) {
+        setInputParameters();
+        computeShapesToReshape();
+    }
+    reshapeCNNNetwork();
+    ExecutableNetwork exeNetwork = ie.LoadNetwork(cnnNetwork, target_device);
+    InferRequest infer_request = exeNetwork.CreateInferRequest();
+    for (int i = 0; i < n; i++) {
+        if (i == n / 2) {
+            log_info("Half of the test have already passed");
+        }
+        infer_request.Infer();
+        OutputsDataMap output_info(cnnNetwork.getOutputsInfo());
+        for (auto &output : output_info)
+            Blob::Ptr outputBlob = infer_request.GetBlob(output.first);
+    }
+}
index 181fb12..b3fc7b3 100644 (file)
@@ -108,18 +108,30 @@
                        Default value is determined automatically for a device. 
                        Please note that although the automatic selection usually provides a reasonable performance, 
                        it still may be non-optimal for some cases, especially for very small networks.
+  -enforcebf16 [ENFORCE_BFLOAT16], --enforce_bfloat16 [ENFORCE_BFLOAT16]
+                        Optional. Enforce execution of floating point
+                        operations in bfloat16 precision where it is acceptable.
   -nthreads NUMBER_THREADS, --number_threads NUMBER_THREADS
                         Number of threads to use for inference on the CPU
                         (including HETERO  and MULTI cases).
-  -pin {YES,NO}, --infer_threads_pinning {YES,NO}
-                        Optional. Enable ("YES" is default value) or disable
-                        ("NO")CPU threads pinning for CPU-involved inference.
+  -pin {YES,NO,NUMA}, --infer_threads_pinning {YES,NO,NUMA}
+                        Optional. Enable threads->cores ('YES' is default
+                        value), threads->(NUMA)nodes ('NUMA') or completely
+                        disable ('NO')CPU threads pinning for CPU-involved
+                        inference.
   --exec_graph_path EXEC_GRAPH_PATH
                         Optional. Path to a file where to store executable
                         graph information serialized.
   -pc [PERF_COUNTS], --perf_counts [PERF_COUNTS]
                         Optional. Report performance counters.
-
+  -dump_config DUMP_CONFIG
+                        Optional. Path to JSON file to dump IE parameters,
+                        which were set by application.
+  -load_config LOAD_CONFIG
+                        Optional. Path to JSON file to load custom IE
+                        parameters. Please note, command line parameters have
+                        higher priority than parameters from configuration
+                        file.
 ```
 
 Running the application with the empty list of options yields the usage message given above and an error message.
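+
+The configuration file used with `-load_config` (and produced by `-dump_config`) is a JSON map of device names to Inference Engine parameters. The exact keys and values depend on the target device, so the Python sketch below only illustrates the assumed structure; it writes a file in the same format that `-dump_config` dumps:
+
+```python
+import json
+
+# Hypothetical per-device settings; the keys shown are examples, not an exhaustive list.
+config = {
+    "CPU": {
+        "PERF_COUNT": "NO",
+        "CPU_BIND_THREAD": "YES",
+        "CPU_THROUGHPUT_STREAMS": "CPU_THROUGHPUT_AUTO",
+    },
+}
+
+with open("ie_config.json", "w") as config_file:
+    json.dump(config, config_file, indent=4)
+```
+
+Such a file can then be passed back with `-load_config ie_config.json`; command line parameters still have higher priority than values from the file.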
index ae4d746..1682bee 100644 (file)
@@ -18,34 +18,33 @@ from datetime import datetime
 from statistics import median
 from openvino.inference_engine import IENetwork, IECore, get_version, StatusCode
 
-from .utils.constants import CPU_DEVICE_NAME, MULTI_DEVICE_NAME, GPU_DEVICE_NAME, MYRIAD_DEVICE_NAME, BIN_EXTENSION
+from .utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, BIN_EXTENSION
 from .utils.logging import logger
-from .utils.utils import get_duration_seconds, parse_nstreams_value_per_device, parse_devices
+from .utils.utils import get_duration_seconds
 from .utils.inputs_filling import get_blob_shape
-
+from .utils.statistics_report import StatisticsReport
 
 class Benchmark:
-    def __init__(self, device: str, number_infer_requests, number_iterations, duration_seconds, api_type):
+    def __init__(self, device: str, number_infer_requests: int = None, number_iterations: int = None,
+                 duration_seconds: int = None, api_type: str = 'async'):
         self.device = device
         self.ie = IECore()
         self.nireq = number_infer_requests
         self.niter = number_iterations
         self.duration_seconds = get_duration_seconds(duration_seconds, self.niter, self.device)
         self.api_type = api_type
-        self.device_number_streams = {}
 
     def __del__(self):
         del self.ie
 
     def add_extension(self, path_to_extension: str=None, path_to_cldnn_config: str=None):
-        if GPU_DEVICE_NAME in self.device:
-            if path_to_cldnn_config:
-                self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME)
-                logger.info('GPU extensions is loaded {}'.format(path_to_cldnn_config))
-        if CPU_DEVICE_NAME in self.device or MYRIAD_DEVICE_NAME in self.device:
-            if path_to_extension:
-                self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME)
-                logger.info('CPU extensions is loaded {}'.format(path_to_extension))
+        if path_to_cldnn_config:
+            self.ie.set_config({'CONFIG_FILE': path_to_cldnn_config}, GPU_DEVICE_NAME)
+            logger.info('GPU extensions is loaded {}'.format(path_to_cldnn_config))
+
+        if path_to_extension:
+            self.ie.add_extension(extension_path=path_to_extension, device_name=CPU_DEVICE_NAME)
+            logger.info('CPU extensions is loaded {}'.format(path_to_extension))
 
     def get_version_info(self) -> str:
         logger.info('InferenceEngine:\n{: <9}{:.<24} {}'.format('', 'API version', get_version()))
@@ -67,57 +66,13 @@ class Benchmark:
             logger.info('Resizing network to batch = {}'.format(batch_size))
             ie_network.reshape(new_shapes)
 
-    def set_config(self, number_streams: int, api_type: str = 'async',
-                   number_threads: int = None, infer_threads_pinning: int = None):
-        devices = parse_devices(self.device)
-        self.device_number_streams = parse_nstreams_value_per_device(devices, number_streams)
-        for device_name in  self.device_number_streams.keys():
-            key = device_name + "_THROUGHPUT_STREAMS"
-            supported_config_keys = self.ie.get_metric(device_name, 'SUPPORTED_CONFIG_KEYS')
-            if key not in supported_config_keys:
-                raise Exception("Device " + device_name + " doesn't support config key '" + key + "'! " +
-                                "Please specify -nstreams for correct devices in format  <dev1>:<nstreams1>,<dev2>:<nstreams2>");
-
-        for device in devices:
-            if device == CPU_DEVICE_NAME:  # CPU supports few special performance-oriented keys
-                # limit threading for CPU portion of inference
-                if number_threads:
-                    self.ie.set_config({'CPU_THREADS_NUM': str(number_threads)}, device)
-
-                if MULTI_DEVICE_NAME in self.device and GPU_DEVICE_NAME in self.device:
-                    self.ie.set_config({'CPU_BIND_THREAD': 'NO'}, CPU_DEVICE_NAME)
-                else:
-                    # pin threads for CPU portion of inference
-                    self.ie.set_config({'CPU_BIND_THREAD': infer_threads_pinning}, device)
-
-                # for CPU execution, more throughput-oriented execution via streams
-                # for pure CPU execution, more throughput-oriented execution via streams
-                if api_type == 'async':
-                    cpu_throughput = {'CPU_THROUGHPUT_STREAMS': 'CPU_THROUGHPUT_AUTO'}
-                    if device in self.device_number_streams.keys():
-                        cpu_throughput['CPU_THROUGHPUT_STREAMS'] = str(self.device_number_streams.get(device))
-                    self.ie.set_config(cpu_throughput, device)
-                    self.device_number_streams[device] = self.ie.get_config(device, 'CPU_THROUGHPUT_STREAMS')
-
-            elif device == GPU_DEVICE_NAME:
-                if api_type == 'async':
-                    gpu_throughput = {'GPU_THROUGHPUT_STREAMS': 'GPU_THROUGHPUT_AUTO'}
-                    if device in self.device_number_streams.keys():
-                        gpu_throughput['GPU_THROUGHPUT_STREAMS'] = str(self.device_number_streams.get(device))
-                    self.ie.set_config(gpu_throughput, device)
-                    self.device_number_streams[device] = self.ie.get_config(device, 'GPU_THROUGHPUT_STREAMS')
-
-                if MULTI_DEVICE_NAME in self.device and CPU_DEVICE_NAME in self.device:
-                    # multi-device execution with the CPU+GPU performs best with GPU trottling hint,
-                    # which releases another CPU thread (that is otherwise used by the GPU driver for active polling)
-                    self.ie.set_config({'CLDNN_PLUGIN_THROTTLE': '1'}, device)
-
-            elif device == MYRIAD_DEVICE_NAME:
-                self.ie.set_config({'LOG_LEVEL': 'LOG_INFO'}, MYRIAD_DEVICE_NAME)
+    def set_config(self, config = {}):
+        for device in config.keys():
+            self.ie.set_config(config[device], device)
 
     def read_network(self, path_to_model: str):
         xml_filename = os.path.abspath(path_to_model)
-        head, tail = os.path.splitext(xml_filename)
+        head, _ = os.path.splitext(xml_filename)
         bin_filename = os.path.abspath(head + BIN_EXTENSION)
 
         ie_network = self.ie.read_network(xml_filename, bin_filename)
@@ -129,15 +84,14 @@ class Benchmark:
 
         return ie_network
 
-    def load_network(self, ie_network: IENetwork, perf_counts: bool):
-        config = {'PERF_COUNT': ('YES' if perf_counts else 'NO')}
-
+    def load_network(self, ie_network: IENetwork, config = {}):
         exe_network = self.ie.load_network(ie_network,
                                            self.device,
                                            config=config,
                                            num_requests=1 if self.api_type == 'sync' else self.nireq or 0)
         # Number of requests
         self.nireq = len(exe_network.requests)
+
         return exe_network
 
     def infer(self, exe_network, batch_size, progress_bar=None):
index 1158bed..f2d4b01 100644 (file)
@@ -4,16 +4,15 @@ from datetime import datetime
 
 from openvino.tools.benchmark.benchmark import Benchmark
 from openvino.tools.benchmark.parameters import parse_args
-from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME
+from openvino.tools.benchmark.utils.constants import MULTI_DEVICE_NAME, HETERO_DEVICE_NAME, CPU_DEVICE_NAME, GPU_DEVICE_NAME, MYRIAD_DEVICE_NAME, BIN_EXTENSION
 from openvino.tools.benchmark.utils.inputs_filling import set_inputs
 from openvino.tools.benchmark.utils.logging import logger
 from openvino.tools.benchmark.utils.progress_bar import ProgressBar
 from openvino.tools.benchmark.utils.utils import next_step, config_network_inputs, get_number_iterations, \
     process_help_inference_string, print_perf_counters, dump_exec_graph, get_duration_in_milliseconds, \
-    get_command_line_arguments
+    get_command_line_arguments, parse_nstreams_value_per_device, parse_devices, load_config, dump_config
 from openvino.tools.benchmark.utils.statistics_report import StatisticsReport, averageCntReport, detailedCntReport
 
-
 def main():
     # ------------------------------ 1. Parsing and validating input arguments -------------------------------------
     next_step()
@@ -27,20 +26,42 @@ def run(args):
                             "Although the automatic selection usually provides a reasonable performance, "
                             "but it still may be non-optimal for some cases, for more information look at README. ")
 
+        command_line_arguments = get_command_line_arguments(sys.argv)
         if args.report_type:
           statistics = StatisticsReport(StatisticsReport.Config(args.report_type, args.report_folder))
-          statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, get_command_line_arguments(sys.argv))
+          statistics.add_parameters(StatisticsReport.Category.COMMAND_LINE_PARAMETERS, command_line_arguments)
+
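+        # Check whether a flag was explicitly passed on the command line (rather than coming from a loaded config or a default)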
+        def is_flag_set_in_command_line(flag):
+            return any(x.strip('-') == flag for x, y in command_line_arguments)
+
+        device_name = args.target_device
 
+        devices = parse_devices(device_name)
+        device_number_streams = parse_nstreams_value_per_device(devices, args.number_streams)
+
+        config = {}
+        if args.load_config:
+            load_config(args.load_config, config)
 
         # ------------------------------ 2. Loading Inference Engine ---------------------------------------------------
         next_step(step_id=2)
 
-        device_name = args.target_device.upper()
-
         benchmark = Benchmark(args.target_device, args.number_infer_requests,
                               args.number_iterations, args.time, args.api_type)
 
-        benchmark.add_extension(args.path_to_extension, args.path_to_cldnn_config)
+        ## CPU (MKLDNN) extensions
+        if CPU_DEVICE_NAME in device_name and args.path_to_extension:
+            benchmark.add_extension(path_to_extension=args.path_to_extension)
+
+        ## GPU (clDNN) Extensions
+        if GPU_DEVICE_NAME in device_name and args.path_to_cldnn_config:
+            if GPU_DEVICE_NAME not in config.keys():
+                config[GPU_DEVICE_NAME] = {}
+            config[GPU_DEVICE_NAME]['CONFIG_FILE'] = args.path_to_cldnn_config
+
+        if GPU_DEVICE_NAME in config.keys() and 'CONFIG_FILE' in config[GPU_DEVICE_NAME].keys():
+            cldnn_config = config[GPU_DEVICE_NAME]['CONFIG_FILE']
+            benchmark.add_extension(path_to_cldnn_config=cldnn_config)
 
         version = benchmark.get_version_info()
 
@@ -74,17 +95,89 @@ def run(args):
 
         # --------------------- 6. Setting device configuration --------------------------------------------------------
         next_step()
-        benchmark.set_config(args.number_streams, args.api_type, args.number_threads,
-                             args.infer_threads_pinning)
+
+        perf_counts = False
+        for device in devices:
+            if device not in config.keys():
+                config[device] = {}
+            ## Set performance counter
+            if is_flag_set_in_command_line('pc'):
+                ## set to user defined value
+                config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
+            elif 'PERF_COUNT' in config[device].keys() and config[device]['PERF_COUNT'] == 'YES':
+                logger.warn("Performance counters for {} device is turned on. ".format(device) +
+                            "To print results use -pc option.")
+            elif args.report_type in [ averageCntReport, detailedCntReport ]:
+                logger.warn("Turn on performance counters for {} device ".format(device) +
+                            "since report type is {}.".format(args.report_type))
+                config[device]['PERF_COUNT'] = 'YES'
+            elif args.exec_graph_path is not None:
+                logger.warn("Turn on performance counters for {} device ".format(device) +
+                            "due to execution graph dumping.")
+                config[device]['PERF_COUNT'] = 'YES'
+            else:
+                ## set to default value
+                config[device]['PERF_COUNT'] = 'YES' if args.perf_counts else 'NO'
+            perf_counts = True if config[device]['PERF_COUNT'] == 'YES' else perf_counts
+
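+            # Resolve <DEVICE>_THROUGHPUT_STREAMS: prefer the -nstreams value, then the loaded config; otherwise fall back to the device AUTO default in async mode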
+            def set_throughput_streams():
+                key = device + "_THROUGHPUT_STREAMS"
+                if device in device_number_streams.keys():
+                    ## set to user defined value
+                    supported_config_keys = benchmark.ie.get_metric(device, 'SUPPORTED_CONFIG_KEYS')
+                    if key not in supported_config_keys:
+                        raise Exception("Device {} doesn't support config key '{}'! ".format(device, key) +
+                                        "Please specify -nstreams for correct devices in format  <dev1>:<nstreams1>,<dev2>:<nstreams2>")
+                    config[device][key] = device_number_streams[device]
+                elif key not in config[device].keys() and args.api_type == "async":
+                    logger.warn("-nstreams default value is determined automatically for {} device. ".format(device) +
+                                "Although the automatic selection usually provides a reasonable performance,"
+                                "but it still may be non-optimal for some cases, for more information look at README.")
+                    config[device][key] = device + "_THROUGHPUT_AUTO"
+                if key in config[device].keys():
+                    device_number_streams[device] = config[device][key]
+
+            if device == CPU_DEVICE_NAME: # CPU supports few special performance-oriented keys
+                # limit threading for CPU portion of inference
+                if args.number_threads and is_flag_set_in_command_line("nthreads"):
+                    config[device]['CPU_THREADS_NUM'] = str(args.number_threads)
+
+                if is_flag_set_in_command_line("enforcebf16") or is_flag_set_in_command_line("enforce_bfloat16"):
+                    config[device]['ENFORCE_BF16'] = 'YES' if args.enforce_bfloat16 else 'NO'
+
+                if is_flag_set_in_command_line('pin'):
+                    ## set to user defined value
+                    config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning
+                elif 'CPU_BIND_THREAD' not in config[device].keys():
+                    if MULTI_DEVICE_NAME in device_name and GPU_DEVICE_NAME in device_name:
+                        logger.warn("Turn off threads pinning for {}".format(device) +
+                                    "device since multi-scenario with GPU device is used.")
+                        config[device]['CPU_BIND_THREAD'] = 'NO'
+                    else:
+                        ## set to default value
+                        config[device]['CPU_BIND_THREAD'] = args.infer_threads_pinning
+
+                ## for CPU execution, more throughput-oriented execution via streams
+                set_throughput_streams()
+            elif device == GPU_DEVICE_NAME:
+                ## for GPU execution, more throughput-oriented execution via streams
+                set_throughput_streams()
+
+                if MULTI_DEVICE_NAME in device_name and CPU_DEVICE_NAME in device_name:
+                    logger.warn("Turn on GPU trottling. Multi-device execution with the CPU + GPU performs best with GPU trottling hint, " +
+                                "which releases another CPU thread (that is otherwise used by the GPU driver for active polling)")
+                    config[device]['CLDNN_PLUGIN_THROTTLE'] = '1'
+            elif device == MYRIAD_DEVICE_NAME:
+                config[device]['LOG_LEVEL'] = 'LOG_INFO'
+        perf_counts = perf_counts
+
+        benchmark.set_config(config)
 
         # --------------------- 7. Loading the model to the device -----------------------------------------------------
         next_step()
 
         start_time = datetime.utcnow()
-        perf_counts = True if args.perf_counts or \
-                              args.report_type in [ averageCntReport, detailedCntReport ] or \
-                              args.exec_graph_path else False
-        exe_network = benchmark.load_network(ie_network, perf_counts)
+        exe_network = benchmark.load_network(ie_network)
         duration_ms = "{:.2f}".format((datetime.utcnow() - start_time).total_seconds() * 1000)
         logger.info("Load network took {} ms".format(duration_ms))
         if statistics:
@@ -92,6 +185,10 @@ def run(args):
                                       [
                                           ('load network time (ms)', duration_ms)
                                       ])
+        ## Update number of streams
+        for device in device_number_streams.keys():
+            key = device + '_THROUGHPUT_STREAMS'
+            device_number_streams[device] = benchmark.ie.get_config(device, key)
 
         # --------------------- 8. Setting optimal runtime parameters --------------------------------------------------
         next_step()
@@ -117,14 +214,14 @@ def run(args):
                                           ('topology', ie_network.name),
                                           ('target device', device_name),
                                           ('API', args.api_type),
-                                          ('precision', str(ie_network.precision)),
+                                          ('precision', "UNSPECIFIED"),
                                           ('batch size', str(batch_size)),
                                           ('number of iterations', str(benchmark.niter) if benchmark.niter else "0"),
                                           ('number of parallel infer requests', str(benchmark.nireq)),
                                           ('duration (ms)', str(get_duration_in_milliseconds(benchmark.duration_seconds))),
                                        ])
 
-            for nstreams in benchmark.device_number_streams.items():
+            for nstreams in device_number_streams.items():
                 statistics.add_parameters(StatisticsReport.Category.RUNTIME_CONFIG,
                                          [
                                             ("number of {} streams".format(nstreams[0]), str(nstreams[1])),
@@ -146,6 +243,10 @@ def run(args):
         # ------------------------------------ 11. Dumping statistics report -------------------------------------------
         next_step()
 
+        if args.dump_config:
+            dump_config(args.dump_config, config)
+            logger.info("Inference Engine configuration settings were dumped to {}".format(args.dump_config))
+
         if args.exec_graph_path:
             dump_exec_graph(exe_network, args.exec_graph_path)
 
index 44a7d37..f7c474f 100644 (file)
@@ -4,7 +4,6 @@ from fnmatch import fnmatch
 from openvino.tools.benchmark.utils.constants import XML_EXTENSION_PATTERN
 from openvino.tools.benchmark.utils.utils import show_available_devices
 
-
 def str2bool(v):
     if v.lower() in ('yes', 'true', 't', 'y', '1'):
         return True
@@ -77,18 +76,19 @@ def parse_args():
                            'Default value is determined automatically for a device. Please note that although the automatic selection '
                            'usually provides a reasonable performance, it still may be non - optimal for some cases, especially for very small networks. '
                            'See samples README for more details.')
-
+    args.add_argument('-enforcebf16', '--enforce_bfloat16', type=str2bool, required=False, default=False, nargs='?', const=True,
+                      help='Optional. Enforce execution of floating point operations in bfloat16 precision where it is acceptable.')
     args.add_argument('-nthreads', '--number_threads', type=int, required=False, default=None,
                       help='Number of threads to use for inference on the CPU '
                            '(including HETERO and MULTI cases).')
     args.add_argument('-pin', '--infer_threads_pinning', type=str, required=False, default='YES', choices=['YES', 'NO', 'NUMA'],
                       help='Optional. Enable  threads->cores (\'YES\' is default value), threads->(NUMA)nodes (\'NUMA\') or completely  disable (\'NO\')' 
                            'CPU threads pinning for CPU-involved inference.')
-    args.add_argument('--exec_graph_path', type=str, required=False,
+    args.add_argument('-exec_graph_path', '--exec_graph_path', type=str, required=False,
                       help='Optional. Path to a file where to store executable graph information serialized.')
     args.add_argument('-pc', '--perf_counts', type=str2bool, required=False, default=False, nargs='?', const=True,
                       help='Optional. Report performance counters.', )
-    args.add_argument('--report_type', type=str, required=False,
+    args.add_argument('-report_type', '--report_type', type=str, required=False,
                       choices=['no_counters', 'average_counters', 'detailed_counters'],
                       help="Optional. Enable collecting statistics report. \"no_counters\" report contains "
                            "configuration options specified, resulting FPS and latency. \"average_counters\" "
@@ -96,8 +96,13 @@ def parse_args():
                            "counters values for each layer from the network. \"detailed_counters\" report "
                            "extends \"average_counters\" report and additionally includes per-layer PM "
                            "counters and latency for each executed infer request.")
-    args.add_argument('--report_folder', type=str, required=False, default='',
+    args.add_argument('-report_folder', '--report_folder', type=str, required=False, default='',
                       help="Optional. Path to a folder where statistics report is stored.")
+    args.add_argument('-dump_config', type=str, required=False, default='',
+                      help="Optional. Path to JSON file to dump IE parameters, which were set by application.")
+    args.add_argument('-load_config', type=str, required=False, default='',
+                      help="Optional. Path to JSON file to load custom IE parameters."
+                           " Please note, command line parameters have higher priority then parameters from configuration file.")
     parsed_args = parser.parse_args()
 
     validate_args(parsed_args)
index 834da5a..32da5e4 100644 (file)
@@ -20,6 +20,7 @@ from .constants import DEVICE_DURATION_IN_SECS, UNKNOWN_DEVICE_TYPE, DEVICE_NIRE
 from .inputs_filling import is_image
 from .logging import logger
 
+import json
 
 def static_vars(**kwargs):
     def decorate(func):
@@ -122,6 +123,8 @@ def get_nireq(target_device):
 
 
 def parse_devices(device_string):
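+    # A bare 'MULTI' or 'HETERO' device string carries no explicit device list, so there are no per-device entries to parse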
+    if device_string in ['MULTI', 'HETERO']:
+        return list()
     devices = device_string
     if ':' in devices:
         devices = devices.partition(':')[2]
@@ -139,14 +142,14 @@ def parse_nstreams_value_per_device(devices, values_string):
         device_value_vec = device_value_string.split(':')
         if len(device_value_vec) == 2:
             device_name = device_value_vec[0]
-            nstreams = int(device_value_vec[1])
+            nstreams = device_value_vec[1]
             if device_name in devices:
                 result[device_name] = nstreams
             else:
                 raise Exception("Can't set nstreams value " + str(nstreams) +
                                 " for device '" + device_name + "'! Incorrect device name!");
         elif len(device_value_vec) == 1:
-            nstreams = int(device_value_vec[0])
+            nstreams = device_value_vec[0]
             for device in devices:
                 result[device] = nstreams
         elif not device_value_vec:
@@ -238,4 +241,12 @@ def get_command_line_arguments(argv):
 
 def show_available_devices():
     ie = IECore()
-    print("\nAvailable target devices:  ", ("  ".join(ie.available_devices)))
\ No newline at end of file
+    print("\nAvailable target devices:  ", ("  ".join(ie.available_devices)))
+
+def dump_config(filename, config):
+    with open(filename, 'w') as f:
+        json.dump(config, f, indent=4)
+
+def load_config(filename, config):
+    with open(filename) as f:
+        config.update(json.load(f))
\ No newline at end of file